├── .github └── workflows │ └── actions.yml ├── .gitignore ├── .golangci.yml ├── CODEOWNERS ├── CODE_OF_CONDUCT.md ├── Dockerfile ├── LICENSE ├── Makefile ├── NOTICE ├── README.md ├── artifacts └── kubectl.template ├── boilerplate └── lyft │ ├── docker_build │ ├── Makefile │ ├── Readme.rst │ └── docker_build.sh │ └── golang_test_targets │ ├── 81868GOPATH │ └── bin │ │ └── golangci-lint │ ├── Makefile │ ├── Readme.rst │ ├── goimports │ └── golangci-lint.sh ├── cmd └── flinkk8soperator │ ├── cmd │ └── root.go │ └── main.go ├── config ├── config.yaml └── test │ └── flinkk8soperator_config.yaml ├── deploy ├── config.yaml ├── crd.yaml ├── flinkk8soperator.yaml ├── flinkk8soperator_local.yaml ├── namespace.yaml ├── role-binding.yaml └── role.yaml ├── docs ├── blue_green_state_machine.mmd ├── blue_green_state_machine.png ├── crd.md ├── dual_state_machine.mmd ├── dual_state_machine.png ├── flink-operator-overview.svg ├── local_dev.md ├── quick-start-guide.md ├── state_machine.md ├── user_guide.md └── who-is-using.md ├── examples ├── README.md ├── beam-python │ ├── Dockerfile │ ├── README.md │ ├── docker-entrypoint.sh │ ├── flink-operator-custom-resource.yaml │ └── src │ │ ├── beam_example │ │ ├── __init__.py │ │ └── pipeline.py │ │ └── requirements.txt └── wordcount │ ├── Dockerfile │ ├── flink-operator-custom-resource.yaml │ ├── pom.xml │ └── src │ └── main │ └── java │ └── org │ └── apache │ └── flink │ ├── WordCount.java │ └── util │ └── WordCountData.java ├── go.mod ├── go.sum ├── integ ├── README.md ├── blue_green_deployment_test.go ├── checkpoint_failure_test.go ├── install.sh ├── job_cancellation_test.go ├── log │ └── log.go ├── main_test.go ├── operator-test-app │ ├── Dockerfile │ ├── docker-entrypoint.sh │ ├── flink-conf.yaml │ ├── pom.xml │ └── src │ │ └── main │ │ └── java │ │ └── com │ │ └── lyft │ │ ├── OperatorTestApp.java │ │ └── Settings.java ├── scaleup_test.go ├── setup.sh ├── simple_test.go ├── test.sh ├── test_app.yaml └── utils │ └── utils.go ├── local_config.yaml ├── pkg ├── apis │ └── app │ │ ├── addtoscheme_v1beta1.go │ │ ├── apis.go │ │ ├── v1alpha1 │ │ ├── doc.go │ │ ├── register.go │ │ ├── types.go │ │ └── zz_generated.deepcopy.go │ │ └── v1beta1 │ │ ├── doc.go │ │ ├── register.go │ │ ├── types.go │ │ └── zz_generated.deepcopy.go ├── client │ └── clientset │ │ └── versioned │ │ ├── clientset.go │ │ ├── doc.go │ │ ├── fake │ │ ├── clientset_generated.go │ │ ├── doc.go │ │ └── register.go │ │ ├── scheme │ │ ├── doc.go │ │ └── register.go │ │ └── typed │ │ └── app │ │ └── v1beta1 │ │ ├── app_client.go │ │ ├── doc.go │ │ ├── fake │ │ ├── doc.go │ │ ├── fake_app_client.go │ │ └── fake_flinkapplication.go │ │ ├── flinkapplication.go │ │ └── generated_expansion.go └── controller │ ├── add_flinkapplication.go │ ├── common │ └── utils.go │ ├── config │ ├── config.go │ ├── config_flags.go │ ├── config_flags_test.go │ └── runtime_config.go │ ├── controller.go │ ├── errors │ ├── codes.go │ └── error.go │ ├── flink │ ├── client │ │ ├── api.go │ │ ├── api_test.go │ │ ├── entities.go │ │ ├── error_handler.go │ │ ├── error_handler_test.go │ │ └── mock │ │ │ └── mock_api.go │ ├── config.go │ ├── config_test.go │ ├── container_utils.go │ ├── container_utils_test.go │ ├── flink.go │ ├── flink_test.go │ ├── ingress.go │ ├── ingress_test.go │ ├── job_manager_controller.go │ ├── job_manager_controller_test.go │ ├── mock │ │ ├── mock_error_handler.go │ │ ├── mock_flink.go │ │ ├── mock_job_manager_controller.go │ │ └── mock_task_manager_controller.go │ ├── task_manager_controller.go │ └── 
task_manager_controller_test.go │ ├── flinkapplication │ ├── controller.go │ ├── flink_state_machine.go │ └── flink_state_machine_test.go │ └── k8 │ ├── cluster.go │ ├── mock │ └── mock_k8.go │ ├── utils.go │ └── utils_test.go ├── script └── lint ├── tmp ├── build │ ├── Dockerfile │ ├── build.sh │ └── docker_build.sh └── codegen │ ├── boilerplate.go.txt │ └── update-generated.sh ├── tools.go └── version └── version.go /.github/workflows/actions.yml: -------------------------------------------------------------------------------- 1 | name: checks 2 | on: 3 | push: 4 | branches: [ master ] 5 | pull_request: 6 | branches: [ master ] 7 | jobs: 8 | unit-tests: 9 | runs-on: ubuntu-22.04 10 | defaults: 11 | run: 12 | working-directory: go/src/github.com/lyft/flinkk8soperator 13 | env: 14 | GOPATH: "/home/runner/work/flinkk8soperator/flinkk8soperator/go/" 15 | steps: 16 | - name: checkout 17 | uses: actions/checkout@v2 18 | with: 19 | fetch-depth: 1 20 | path: go/src/github.com/lyft/flinkk8soperator 21 | - name: install go 22 | uses: actions/setup-go@v2 23 | with: 24 | go-version: "1.20" 25 | - name: test 26 | run: make test_unit 27 | lint: 28 | runs-on: ubuntu-22.04 29 | defaults: 30 | run: 31 | working-directory: go/src/github.com/lyft/flinkk8soperator 32 | env: 33 | GOPATH: "/home/runner/work/flinkk8soperator/flinkk8soperator/go/" 34 | steps: 35 | - name: checkout 36 | uses: actions/checkout@v2 37 | with: 38 | fetch-depth: 1 39 | path: go/src/github.com/lyft/flinkk8soperator 40 | - name: install go 41 | uses: actions/setup-go@v2 42 | with: 43 | go-version: "1.20" 44 | - name: test 45 | run: make lint 46 | integration-tests: 47 | runs-on: ubuntu-22.04 48 | defaults: 49 | run: 50 | working-directory: go/src/github.com/lyft/flinkk8soperator 51 | env: 52 | GOPATH: "/home/runner/work/flinkk8soperator/flinkk8soperator/go/" 53 | steps: 54 | - name: checkout 55 | uses: actions/checkout@v2 56 | with: 57 | fetch-depth: 1 58 | path: go/src/github.com/lyft/flinkk8soperator 59 | - name: install go 60 | uses: actions/setup-go@v2 61 | with: 62 | go-version: "1.20" 63 | - name: install 64 | run: integ/install.sh 65 | - name: setup 66 | run: integ/setup.sh 67 | - name: test 68 | run: PATH=$PATH GOPATH=$GOPATH integ/test.sh 69 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Temporary Build Files 3 | tmp/_output 4 | tmp/_test 5 | 6 | 7 | # Created by https://www.gitignore.io/api/go,vim,emacs,visualstudiocode 8 | 9 | ### Emacs ### 10 | # -*- mode: gitignore; -*- 11 | *~ 12 | \#*\# 13 | /.emacs.desktop 14 | /.emacs.desktop.lock 15 | *.elc 16 | auto-save-list 17 | tramp 18 | .\#* 19 | 20 | # Org-mode 21 | .org-id-locations 22 | *_archive 23 | 24 | # flymake-mode 25 | *_flymake.* 26 | 27 | # eshell files 28 | /eshell/history 29 | /eshell/lastdir 30 | 31 | # elpa packages 32 | /elpa/ 33 | 34 | # reftex files 35 | *.rel 36 | 37 | # AUCTeX auto folder 38 | /auto/ 39 | 40 | # cask packages 41 | .cask/ 42 | dist/ 43 | 44 | # Flycheck 45 | flycheck_*.el 46 | 47 | # server auth directory 48 | /server/ 49 | 50 | # projectiles files 51 | .projectile 52 | projectile-bookmarks.eld 53 | 54 | # directory configuration 55 | .dir-locals.el 56 | 57 | # saveplace 58 | places 59 | 60 | # url cache 61 | url/cache/ 62 | 63 | # cedet 64 | ede-projects.el 65 | 66 | # smex 67 | smex-items 68 | 69 | # company-statistics 70 | company-statistics-cache.el 71 | 72 | # anaconda-mode 73 | anaconda-mode/ 74 | 
75 | ### Go ### 76 | # Binaries for programs and plugins 77 | *.exe 78 | *.exe~ 79 | *.dll 80 | *.so 81 | *.dylib 82 | 83 | # Test binary, build with 'go test -c' 84 | *.test 85 | 86 | # Output of the go coverage tool, specifically when used with LiteIDE 87 | *.out 88 | 89 | ### Vim ### 90 | # swap 91 | .sw[a-p] 92 | .*.sw[a-p] 93 | # session 94 | Session.vim 95 | # temporary 96 | .netrwhist 97 | # auto-generated tag files 98 | tags 99 | 100 | ### VisualStudioCode ### 101 | .vscode/* 102 | !.vscode/settings.json 103 | !.vscode/tasks.json 104 | !.vscode/launch.json 105 | !.vscode/extensions.json 106 | .history 107 | 108 | 109 | # End of https://www.gitignore.io/api/go,vim,emacs,visualstudiocode 110 | .idea 111 | vendor 112 | bin 113 | 114 | 115 | .idea/ 116 | .DS_Store 117 | *.iml 118 | vendor/ 119 | examples/wordcount/target/ 120 | -------------------------------------------------------------------------------- /.golangci.yml: -------------------------------------------------------------------------------- 1 | run: 2 | skip-dirs: 3 | - pkg/client 4 | - vendor/ 5 | linters: 6 | disable-all: true 7 | enable: 8 | - deadcode 9 | - errcheck 10 | - gas 11 | - goconst 12 | - goimports 13 | - golint 14 | - gosimple 15 | - govet 16 | - ineffassign 17 | - misspell 18 | - nakedret 19 | - staticcheck 20 | - structcheck 21 | - typecheck 22 | - unconvert 23 | - unparam 24 | - unused 25 | - varcheck 26 | -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | # These owners will be the default owners for everything in 2 | # the repo. Unless a later match takes precedence. 3 | * @anandswaminathan @premsantosh @maghamravi @sethsaperstein-lyft @leoluoInSea 4 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | This project is governed by [Lyft's code of conduct](https://github.com/lyft/code-of-conduct). 2 | All contributors and participants agree to abide by its terms. 3 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM golang:1.20.2-alpine3.17 as builder 2 | RUN apk add git openssh-client make curl bash 3 | 4 | # COPY only the dep files for efficient caching 5 | COPY go.mod go.sum /go/src/github.com/lyft/flinkk8soperator/ 6 | WORKDIR /go/src/github.com/lyft/flinkk8soperator 7 | 8 | # Pull dependencies 9 | RUN go mod download 10 | 11 | # COPY the rest of the source code 12 | COPY . 
/go/src/github.com/lyft/flinkk8soperator/ 13 | 14 | # This 'linux_compile' target should compile binaries to the /artifacts directory 15 | # The main entrypoint should be compiled to /artifacts/flinkk8soperator 16 | RUN go mod vendor && make linux_compile 17 | 18 | # update the PATH to include the /artifacts directory 19 | ENV PATH="/artifacts:${PATH}" 20 | 21 | # This will eventually move to centurylink/ca-certs:latest for minimum possible image size 22 | FROM alpine:3.17 23 | COPY --from=builder /artifacts /bin 24 | CMD ["flinkoperator"] 25 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | export REPOSITORY=flinkk8soperator 2 | include boilerplate/lyft/docker_build/Makefile 3 | include boilerplate/lyft/golang_test_targets/Makefile 4 | 5 | .PHONY: generate 6 | generate: 7 | tmp/codegen/update-generated.sh 8 | 9 | .PHONY: compile 10 | compile: generate 11 | mkdir -p ./bin 12 | go build -o bin/flinkoperator ./cmd/flinkk8soperator/main.go 13 | 14 | .PHONY: linux_compile 15 | linux_compile: generate 16 | GOOS=linux GOARCH=amd64 CGO_ENABLED=0 go build -o /artifacts/flinkoperator ./cmd/flinkk8soperator/main.go 17 | 18 | gen-config: 19 | which pflags || (go get github.com/lyft/flytestdlib/cli/pflags) 20 | @go generate ./... 21 | 22 | all: compile 23 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | flinkk8soperator 2 | Copyright 2019-2020 Lyft Inc. 3 | 4 | This product includes software developed at Lyft Inc. 5 | 6 | Notices for file(s): 7 | examples/wordcount/src/main/java/org/apache/flink/ contains work from https://github.com/apache/flink under the Apache2 license. 8 | 9 | /* 10 | Copyright 2016 The Kubernetes Authors. 11 | Licensed under the Apache License, Version 2.0 (the "License"); 12 | you may not use this file except in compliance with the License. 13 | You may obtain a copy of the License at 14 | http://www.apache.org/licenses/LICENSE-2.0 15 | Unless required by applicable law or agreed to in writing, software 16 | distributed under the License is distributed on an "AS IS" BASIS, 17 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | See the License for the specific language governing permissions and 19 | limitations under the License. 
20 | */ 21 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Current Release](https://img.shields.io/github/release/lyft/flinkk8soperator.svg)](https://github.com/lyft/flinkk8soperator/releases/latest) 2 | [![Build Status](https://travis-ci.org/lyft/flinkk8soperator.svg?branch=master)](https://travis-ci.org/lyft/flinkk8soperator) 3 | [![GoDoc](https://godoc.org/github.com/lyft/flinkk8soperator?status.svg)](https://godoc.org/github.com/lyft/flinkk8soperator) 4 | [![License](https://img.shields.io/badge/LICENSE-Apache2.0-ff69b4.svg)](http://www.apache.org/licenses/LICENSE-2.0.html) 5 | [![CodeCoverage](https://img.shields.io/codecov/c/github/lyft/flinkk8soperator.svg)](https://codecov.io/gh/lyft/flinkk8soperator) 6 | [![Go Report Card](https://goreportcard.com/badge/github.com/lyft/flinkk8soperator)](https://goreportcard.com/report/github.com/lyft/flinkk8soperator) 7 | ![Commit activity](https://img.shields.io/github/commit-activity/w/lyft/flinkk8soperator.svg?style=plastic) 8 | ![Commit since last release](https://img.shields.io/github/commits-since/lyft/flinkk8soperator/latest.svg?style=plastic) 9 | [![Slack](https://img.shields.io/badge/slack-join_chat-white.svg?logo=slack&style=social)](http://go.lyft.com/flinkoperator_slack) 10 | 11 | 12 | # Flinkk8soperator 13 | FlinkK8sOperator is a [Kubernetes operator](https://coreos.com/operators/) that manages [Flink](https://flink.apache.org/) applications on Kubernetes. The operator acts as control plane to manage the complete deployment lifecycle of the application. 14 | 15 | 16 | ## Project Status 17 | 18 | *Beta* 19 | 20 | The operator is in use for some less-critical jobs at Lyft. At this point the focus is on testing and stability. While in 21 | Beta, we will attempt to limit the number of backwards-incompatible changes, but they may still occur as necessary. 22 | 23 | ## Prerequisites 24 | * Version >= 1.10 of Kubernetes (versions < 1.13 require `--feature-gates=CustomResourceSubresources=true`) 25 | * Version >= 1.7 of Apache Flink. 26 | 27 | ## Overview 28 | 29 | ![Flink operator overview](docs/flink-operator-overview.svg) 30 | 31 | The goal of running Flink on Kubernetes is to enable more flexible, lighter-weight deployment of streaming applications, without needing to manage infrastructure. The Flink operator aims to abstract out the complexity of hosting, configuring, managing and operating Flink clusters from application developers. It achieves this by extending any kubernetes cluster using [custom resources](https://kubernetes.io/docs/concepts/extend-kubernetes/api-extension/custom-resources). 32 | 33 | The Operator creates flink clusters dynamically using the specified custom resource. 
Flink clusters in kubernetes consist of the following: 34 | * JobManager [Deployment](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/) 35 | * TaskManager [Deployment](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/) 36 | * JobManager [Service](https://kubernetes.io/docs/concepts/services-networking/service/) 37 | * JobManager [Ingress](https://kubernetes.io/docs/concepts/services-networking/ingress/) for the UI (optional) 38 | 39 | Deploying and managing Flink applications in Kubernetes involves two steps: 40 | 41 | * **Building Flink application packaged as a docker image:** A docker image is built containing the application source code with the necessary dependencies built in. This is required to bootstrap the Jobmanager and Taskmanager pods. At Lyft we use Source-To-Image [S2I](https://docs.openshift.com/container-platform/3.11/architecture/core_concepts/builds_and_image_streams.html#source-build) as the image build tool that provides a common builder image with Apache Flink pre-installed. The docker image could be built using any pre-existing workflows at an organization. 42 | 43 | * **Creating the Flink application custom resource:** The custom resource for Flink application provides the spec for configuring and managing flink clusters in Kubernetes. The FlinkK8sOperator, deployed on Kubernetes, continuously monitors the resource and the corresponding flink cluster, and performs actions based on the diff. 44 | 45 | ## Documentation 46 | 47 | * [Quick start guide](/docs/quick-start-guide.md) 48 | * [User guide](/docs/user_guide.md) 49 | * [Flink application custom resource](/docs/crd.md) 50 | * [Operator state machine](/docs/state_machine.md) 51 | 52 | ## Community 53 | 54 | * Join our [Slack](http://go.lyft.com/flinkoperator_slack) channel. 55 | * Check out [who is using FlinkK8sOperator](docs/who-is-using.md). 56 | 57 | We welcome you to contribute and make the operator better! For questions, and changes please create an [issue](https://github.com/lyft/flinkk8soperator/issues/new) or submit a [pull request](https://github.com/lyft/flinkk8soperator/compare). 58 | -------------------------------------------------------------------------------- /artifacts/kubectl.template: -------------------------------------------------------------------------------- 1 | Namespace Name Status CreatedAt LastUpdatedAt Reason 2 | .metadata.namespace .metadata.name .status.phase .metadata.creationTimestamp .status.last_updated_at .status.reason 3 | -------------------------------------------------------------------------------- /boilerplate/lyft/docker_build/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: docker_build 2 | docker_build: 3 | IMAGE_NAME=$$REPOSITORY ./boilerplate/lyft/docker_build/docker_build.sh 4 | 5 | .PHONY: dockerhub_push 6 | dockerhub_push: 7 | IMAGE_NAME=lyft/$$REPOSITORY REGISTRY=docker.io ./boilerplate/lyft/docker_build/docker_build.sh 8 | -------------------------------------------------------------------------------- /boilerplate/lyft/docker_build/Readme.rst: -------------------------------------------------------------------------------- 1 | Docker Build and Push 2 | ~~~~~~~~~~~~~~~~~~~~~ 3 | 4 | Provides a ``make docker_build`` target that builds your image locally. 5 | 6 | Provides a ``make dockerhub_push`` target that pushes your final image to Dockerhub. 
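For example, a minimal usage sketch, assuming ``REPOSITORY`` is exported as in the main ``Makefile`` and Docker is available locally (the ``<user>``/``<token>`` values are placeholders you supply; ``docker_build.sh`` only reads them when pushing)::

    # builds <REPOSITORY>:<git sha> locally
    make docker_build

    # builds and pushes lyft/<REPOSITORY>:<git sha> to Dockerhub
    DOCKER_REGISTRY_USERNAME=<user> DOCKER_REGISTRY_PASSWORD=<token> make dockerhub_push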
7 | 8 | The Dockerhub image will be tagged ``:`` 9 | 10 | If git head has a git tag, the Dockerhub image will also be tagged ``:``. 11 | 12 | **To Enable:** 13 | 14 | Add ``lyft/docker_build`` to your ``boilerplate/update.cfg`` file. 15 | 16 | Your Dockerfile **must** use docker's `multi-stage builds `_ and name the builder stage 'builder'. 17 | 18 | Add ``include boilerplate/lyft/docker_build/Makefile`` in your main ``Makefile`` _after_ your REPOSITORY environment variable 19 | 20 | :: 21 | 22 | REPOSITORY= 23 | include boilerplate/lyft/docker_build/Makefile 24 | 25 | (this ensures the extra Make targets get included in your main Makefile) 26 | -------------------------------------------------------------------------------- /boilerplate/lyft/docker_build/docker_build.sh: -------------------------------------------------------------------------------- 1 | set -e 2 | 3 | echo "" 4 | echo "------------------------------------" 5 | echo " DOCKER BUILD" 6 | echo "------------------------------------" 7 | echo "" 8 | 9 | # If you have a special id_rsa file, you can pass it here. 10 | : ${RSA_FILE=~/.ssh/id_rsa} 11 | 12 | if [ -n "$REGISTRY" ]; then 13 | # Do not push if there are unstaged git changes 14 | CHANGED=$(git status --porcelain) 15 | if [ -n "$CHANGED" ]; then 16 | echo "Please commit git changes before pushing to a registry" 17 | exit 1 18 | fi 19 | fi 20 | 21 | 22 | GIT_SHA=$(git rev-parse HEAD) 23 | 24 | IMAGE_TAG_SUFFIX="" 25 | # for intermediate build phases, append -$BUILD_PHASE to all image tags 26 | if [ -n "$BUILD_PHASE" ]; then 27 | IMAGE_TAG_SUFFIX="-${BUILD_PHASE}" 28 | fi 29 | 30 | IMAGE_TAG_WITH_SHA="${IMAGE_NAME}:${GIT_SHA}${IMAGE_TAG_SUFFIX}" 31 | 32 | RELEASE_SEMVER=$(git describe --tags --exact-match "$GIT_SHA" 2>/dev/null) || true 33 | if [ -n "$RELEASE_SEMVER" ]; then 34 | IMAGE_TAG_WITH_SEMVER="${IMAGE_NAME}:${RELEASE_SEMVER}${IMAGE_TAG_SUFFIX}" 35 | fi 36 | 37 | # build the image 38 | docker build -t "$IMAGE_TAG_WITH_SHA" . 39 | echo "${IMAGE_TAG_WITH_SHA} built locally." 40 | 41 | # if REGISTRY is specified, push the images to the remote registry 42 | if [ -n "$REGISTRY" ]; then 43 | 44 | if [ -n "${DOCKER_REGISTRY_PASSWORD}" ]; then 45 | docker login --username="$DOCKER_REGISTRY_USERNAME" --password="$DOCKER_REGISTRY_PASSWORD" 46 | fi 47 | 48 | docker tag "$IMAGE_TAG_WITH_SHA" "${REGISTRY}/${IMAGE_TAG_WITH_SHA}" 49 | 50 | docker push "${REGISTRY}/${IMAGE_TAG_WITH_SHA}" 51 | echo "${REGISTRY}/${IMAGE_TAG_WITH_SHA} pushed to remote." 52 | 53 | # If the current commit has a semver tag, also push the images with the semver tag 54 | if [ -n "$RELEASE_SEMVER" ]; then 55 | 56 | docker tag "$IMAGE_TAG_WITH_SHA" "${REGISTRY}/${IMAGE_TAG_WITH_SEMVER}" 57 | 58 | docker push "${REGISTRY}/${IMAGE_TAG_WITH_SEMVER}" 59 | echo "${REGISTRY}/${IMAGE_TAG_WITH_SEMVER} pushed to remote."
60 | 61 | fi 62 | fi 63 | -------------------------------------------------------------------------------- /boilerplate/lyft/golang_test_targets/81868GOPATH/bin/golangci-lint: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lyft/flinkk8soperator/9eae44173ddf1dee9f8bbf3263100a4bc07a3049/boilerplate/lyft/golang_test_targets/81868GOPATH/bin/golangci-lint -------------------------------------------------------------------------------- /boilerplate/lyft/golang_test_targets/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: lint 2 | lint: #lints the package for common code smells 3 | which golangci-lint || sh boilerplate/lyft/golang_test_targets/golangci-lint.sh -b $$GOPATH/bin v1.51.2 4 | golangci-lint run --timeout 5m 5 | 6 | # If code is failing the goimports linter, this will fix it. 7 | # skips 'vendor' 8 | .PHONY: goimports 9 | goimports: 10 | @boilerplate/lyft/golang_test_targets/goimports 11 | 12 | .PHONY: test_unit 13 | test_unit: 14 | go test -cover ./... -race 15 | 16 | .PHONY: test_benchmark 17 | test_benchmark: 18 | go test -bench . ./... 19 | 20 | .PHONY: test_unit_cover 21 | test_unit_cover: 22 | go test ./... -coverprofile /tmp/cover.out -covermode=count; go tool cover -func /tmp/cover.out 23 | 24 | .PHONY: test_unit_visual 25 | test_unit_visual: 26 | go test ./... -coverprofile /tmp/cover.out -covermode=count; go tool cover -html=/tmp/cover.out 27 | -------------------------------------------------------------------------------- /boilerplate/lyft/golang_test_targets/Readme.rst: -------------------------------------------------------------------------------- 1 | Golang Test Targets 2 | ~~~~~~~~~~~~~~~~~~~ 3 | 4 | Provides an ``install`` make target that uses ``dep`` to install golang dependencies. 5 | 6 | Provides a ``lint`` make target that uses golangci to lint your code. 7 | 8 | Provides a ``test_unit`` target for unit tests. 9 | 10 | Provides a ``test_unit_cover`` target for analysing coverage of unit tests, which will output the coverage of each function and total statement coverage. 11 | 12 | Provides a ``test_unit_visual`` target for visualizing coverage of unit tests through an interactive html code heat map. 13 | 14 | Provides a ``test_benchmark`` target for benchmark tests. 15 | 16 | **To Enable:** 17 | 18 | Add ``lyft/golang_test_targets`` to your ``boilerplate/update.cfg`` file. 19 | 20 | Make sure you're using ``dep`` for dependency management. 21 | 22 | Provide a ``.golangci`` configuration (the lint target requires it). 23 | 24 | Add ``include boilerplate/lyft/golang_test_targets/Makefile`` in your main ``Makefile`` _after_ your REPOSITORY environment variable 25 | 26 | :: 27 | 28 | REPOSITORY= 29 | include boilerplate/lyft/golang_test_targets/Makefile 30 | 31 | (this ensures the extra make targets get included in your main Makefile) 32 | -------------------------------------------------------------------------------- /boilerplate/lyft/golang_test_targets/goimports: -------------------------------------------------------------------------------- 1 | goimports -w $(find .
-type f -name '*.go' -not -path "./vendor/*" -not -path "./pkg/client/*") 2 | -------------------------------------------------------------------------------- /cmd/flinkk8soperator/cmd/root.go: -------------------------------------------------------------------------------- 1 | package cmd 2 | 3 | import ( 4 | "context" 5 | "flag" 6 | "fmt" 7 | "os" 8 | "os/signal" 9 | "strings" 10 | "syscall" 11 | 12 | klog "k8s.io/klog/v2" 13 | 14 | "sigs.k8s.io/controller-runtime/pkg/cache" 15 | 16 | "github.com/lyft/flytestdlib/config/viper" 17 | "github.com/lyft/flytestdlib/version" 18 | 19 | "github.com/lyft/flytestdlib/config" 20 | "github.com/lyft/flytestdlib/logger" 21 | "github.com/spf13/pflag" 22 | 23 | "github.com/lyft/flinkk8soperator/pkg/controller/common" 24 | "github.com/spf13/cobra" 25 | 26 | "github.com/lyft/flinkk8soperator/pkg/controller" 27 | controllerConfig "github.com/lyft/flinkk8soperator/pkg/controller/config" 28 | ctrlRuntimeConfig "sigs.k8s.io/controller-runtime/pkg/client/config" 29 | 30 | apis "github.com/lyft/flinkk8soperator/pkg/apis/app" 31 | "github.com/lyft/flytestdlib/profutils" 32 | "github.com/lyft/flytestdlib/promutils" 33 | "github.com/lyft/flytestdlib/promutils/labeled" 34 | "github.com/pkg/errors" 35 | "sigs.k8s.io/controller-runtime/pkg/manager" 36 | ) 37 | 38 | var ( 39 | cfgFile string 40 | configAccessor = viper.NewAccessor(config.Options{}) 41 | ) 42 | 43 | // rootCmd represents the base command when called without any subcommands 44 | var rootCmd = &cobra.Command{ 45 | Use: "flinkoperator", 46 | Short: "Operator for running Flink applications in kubernetes", 47 | PreRunE: func(cmd *cobra.Command, args []string) error { 48 | return initConfig(cmd.Flags()) 49 | }, 50 | RunE: func(cmd *cobra.Command, args []string) error { 51 | return executeRootCmd(controllerConfig.GetConfig()) 52 | }, 53 | } 54 | 55 | // Execute adds all child commands to the root command and sets flags appropriately. 56 | // This is called by main.main(). It only needs to happen once to the rootCmd. 57 | func Execute() { 58 | version.LogBuildInformation(controllerConfig.AppName) 59 | if err := rootCmd.Execute(); err != nil { 60 | fmt.Println(err) 61 | os.Exit(1) 62 | } 63 | } 64 | 65 | func Run(config *controllerConfig.Config) error { 66 | if err := controllerConfig.SetConfig(config); err != nil { 67 | logger.Errorf(context.Background(), "Failed to set config: %v", err) 68 | return err 69 | } 70 | 71 | return executeRootCmd(controllerConfig.GetConfig()) 72 | } 73 | 74 | func init() { 75 | // See https://gist.github.com/nak3/78a32817a8a3950ae48f239a44cd3663 76 | // allows `$ flinkoperator --logtostderr` to work 77 | klog.InitFlags(nil) 78 | pflag.CommandLine.AddGoFlagSet(flag.CommandLine) 79 | err := flag.CommandLine.Parse([]string{}) 80 | if err != nil { 81 | logAndExit(err) 82 | } 83 | 84 | // Here you will define your flags and configuration settings. Cobra supports persistent flags, which, if defined 85 | // here, will be global for your application. 
86 | rootCmd.PersistentFlags().StringVar(&cfgFile, "config", "", 87 | "config file path to load configuration") 88 | 89 | configAccessor.InitializePflags(rootCmd.PersistentFlags()) 90 | } 91 | 92 | func initConfig(flags *pflag.FlagSet) error { 93 | configAccessor = viper.NewAccessor(config.Options{ 94 | SearchPaths: []string{cfgFile}, 95 | }) 96 | 97 | configAccessor.InitializePflags(flags) 98 | err := configAccessor.UpdateConfig(context.Background()) 99 | if err != nil { 100 | return err 101 | } 102 | return nil 103 | } 104 | 105 | func logAndExit(err error) { 106 | logger.Error(context.Background(), err) 107 | os.Exit(-1) 108 | } 109 | 110 | func executeRootCmd(controllerCfg *controllerConfig.Config) error { 111 | ctx, cancelNow := context.WithCancel(context.Background()) 112 | 113 | labeled.SetMetricKeys(common.GetValidLabelNames()...) 114 | 115 | logger.Infof(ctx, "%+v\n", controllerCfg) 116 | 117 | if controllerCfg.MetricsPrefix == "" { 118 | logAndExit(errors.New("Invalid config: Metric prefix empty")) 119 | } 120 | operatorScope := promutils.NewScope(controllerCfg.MetricsPrefix) 121 | 122 | go func() { 123 | err := profutils.StartProfilingServerWithDefaultHandlers(ctx, controllerCfg.ProfilerPort.Port, nil) 124 | if err != nil { 125 | logger.Panicf(ctx, "Failed to Start profiling and metrics server. Error: %v", err) 126 | } 127 | }() 128 | 129 | if err := operatorEntryPoint(ctx, operatorScope, controllerCfg); err != nil { 130 | cancelNow() 131 | return err 132 | } 133 | 134 | <-ctx.Done() 135 | cancelNow() 136 | return nil 137 | } 138 | 139 | func operatorEntryPoint(ctx context.Context, metricsScope promutils.Scope, controllerCfg *controllerConfig.Config) error { 140 | // Get a config to talk to the apiserver 141 | cfg, err := ctrlRuntimeConfig.GetConfig() 142 | if err != nil { 143 | return err 144 | } 145 | 146 | limitNameSpace := strings.TrimSpace(controllerCfg.LimitNamespace) 147 | var mgr manager.Manager 148 | 149 | if limitNameSpace == "" { 150 | mgr, err = manager.New(cfg, manager.Options{ 151 | SyncPeriod: &controllerCfg.ResyncPeriod.Duration, 152 | }) 153 | } else { 154 | namespaceList := strings.Split(limitNameSpace, ",") 155 | mgr, err = manager.New(cfg, manager.Options{ 156 | NewCache: cache.MultiNamespacedCacheBuilder(namespaceList), 157 | SyncPeriod: &controllerCfg.ResyncPeriod.Duration, 158 | }) 159 | } 160 | 161 | if err != nil { 162 | return err 163 | } 164 | 165 | logger.Infof(ctx, "Registering Components.") 166 | 167 | // Setup Scheme for all resources 168 | if err := apis.AddToScheme(mgr.GetScheme()); err != nil { 169 | return err 170 | } 171 | 172 | // Setup all Controllers 173 | logger.Infof(ctx, "Adding controllers.") 174 | if err := controller.AddToManager(ctx, mgr, controllerConfig.RuntimeConfig{ 175 | MetricsScope: metricsScope, 176 | }); err != nil { 177 | return err 178 | } 179 | 180 | // Start the Cmd 181 | logger.Infof(ctx, "Starting the Cmd.") 182 | ctx, _ = signal.NotifyContext(ctx, os.Interrupt, syscall.SIGTERM) 183 | return mgr.Start(ctx) 184 | } 185 | -------------------------------------------------------------------------------- /cmd/flinkk8soperator/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "github.com/lyft/flinkk8soperator/cmd/flinkk8soperator/cmd" 5 | ) 6 | 7 | func main() { 8 | cmd.Execute() 9 | } 10 | -------------------------------------------------------------------------------- /config/config.yaml: 
-------------------------------------------------------------------------------- 1 | apiVersion: flink.k8s.io/v1beta1 2 | kind: FlinkApplication 3 | projectName: flinkk8soperator 4 | -------------------------------------------------------------------------------- /config/test/flinkk8soperator_config.yaml: -------------------------------------------------------------------------------- 1 | # This is a default configuration file for test. 2 | # Real configuration when running inside K8s (local or otherwise) lives in a ConfigMap 3 | # The operator will replace "job" field with the correct flink job name 4 | operator: 5 | ingressUrlFormat: "{{$jobCluster}}.lyft.xyz" 6 | containerNameFormat: "%s-unknown" 7 | logger: 8 | show-source: true 9 | level: 4 10 | -------------------------------------------------------------------------------- /deploy/config.yaml: -------------------------------------------------------------------------------- 1 | kind: ConfigMap 2 | apiVersion: v1 3 | metadata: 4 | name: flink-operator-config 5 | namespace: flink-operator 6 | data: 7 | # this will need to be templatized 8 | config: |- 9 | operator: 10 | ingressUrlFormat: "{{$jobCluster}}.{ingress_suffix}" 11 | logger: 12 | level: 4 13 | -------------------------------------------------------------------------------- /deploy/flinkk8soperator.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: flinkoperator 5 | namespace: flink-operator 6 | labels: 7 | app: flinkoperator 8 | spec: 9 | replicas: 1 10 | selector: 11 | matchLabels: 12 | app: flinkoperator 13 | template: 14 | metadata: 15 | labels: 16 | app: flinkoperator 17 | app.kubernetes.io/version: 0.5.0 18 | spec: 19 | serviceAccountName: flinkoperator 20 | volumes: 21 | - name: config-volume 22 | configMap: 23 | name: flink-operator-config 24 | items: 25 | - key: config 26 | path: config.yaml 27 | containers: 28 | - name: flinkoperator-gojson 29 | image: docker.io/lyft/flinkk8soperator:v0.5.0 30 | command: 31 | - flinkoperator 32 | args: 33 | - --logtostderr 34 | - --config 35 | - /etc/flinkoperator/config*/config.yaml 36 | env: 37 | - name: OPERATOR_NAME 38 | value: flinkk8soperator 39 | imagePullPolicy: IfNotPresent 40 | ports: 41 | - containerPort: 10254 42 | resources: 43 | requests: 44 | memory: "4Gi" 45 | cpu: "4" 46 | limits: 47 | memory: "8G" 48 | cpu: "8" 49 | volumeMounts: 50 | - name: config-volume 51 | mountPath: /etc/flinkoperator/config 52 | -------------------------------------------------------------------------------- /deploy/flinkk8soperator_local.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | kind: ConfigMap 3 | apiVersion: v1 4 | metadata: 5 | name: flink-operator-config 6 | data: 7 | # this will need to be templatized 8 | development: |- 9 | operator: 10 | containerNameFormat: "%s-unknown" 11 | --- 12 | 13 | # Create the actual deployment 14 | apiVersion: apps/v1 15 | kind: Deployment 16 | metadata: 17 | name: flinkk8soperator 18 | namespace: default 19 | labels: 20 | app: flinkk8soperator 21 | spec: 22 | replicas: 1 23 | selector: 24 | matchLabels: 25 | app: flinkk8soperator 26 | template: 27 | metadata: 28 | labels: 29 | app: flinkk8soperator 30 | app.kubernetes.io/name: flinkk8soperator 31 | app.kubernetes.io/version: 0.5.0 32 | spec: 33 | volumes: 34 | - name: config-volume 35 | configMap: 36 | name: flink-operator-config 37 | items: 38 | - key: development 39 | path: 
flinkk8soperator_config.yaml 40 | containers: 41 | - name: flinkk8soperator 42 | image: flinkk8soperator 43 | env: 44 | - name: OPERATOR_NAME 45 | value: flinkk8soperator 46 | imagePullPolicy: Never 47 | ports: 48 | - containerPort: 10254 49 | resources: 50 | requests: 51 | memory: "1Gi" 52 | cpu: "1" 53 | limits: 54 | memory: "8G" 55 | cpu: "8" 56 | volumeMounts: 57 | - name: config-volume 58 | mountPath: /etc/flinkk8soperator/config 59 | -------------------------------------------------------------------------------- /deploy/namespace.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Namespace 3 | metadata: 4 | name: flink-operator 5 | -------------------------------------------------------------------------------- /deploy/role-binding.yaml: -------------------------------------------------------------------------------- 1 | # Create a binding from Role -> ServiceAccount 2 | kind: ClusterRoleBinding 3 | apiVersion: rbac.authorization.k8s.io/v1 4 | metadata: 5 | name: flinkoperator 6 | roleRef: 7 | apiGroup: rbac.authorization.k8s.io 8 | kind: ClusterRole 9 | name: flinkoperator 10 | subjects: 11 | - kind: ServiceAccount 12 | name: flinkoperator 13 | namespace: flink-operator 14 | -------------------------------------------------------------------------------- /deploy/role.yaml: -------------------------------------------------------------------------------- 1 | # Create a ClusterRole for flinkk8soperator 2 | # https://kubernetes.io/docs/admin/authorization/rbac/ 3 | kind: ClusterRole 4 | apiVersion: rbac.authorization.k8s.io/v1 5 | metadata: 6 | name: flinkoperator 7 | rules: 8 | - apiGroups: 9 | - "" 10 | resources: 11 | - pods 12 | verbs: 13 | - get 14 | - list 15 | - watch 16 | - apiGroups: 17 | - "" 18 | resources: 19 | - services 20 | verbs: 21 | - create 22 | - get 23 | - list 24 | - watch 25 | - update 26 | - delete 27 | - apiGroups: 28 | - extensions 29 | - apps 30 | resources: 31 | - deployments 32 | - deployments/status 33 | - ingresses 34 | - ingresses/status 35 | verbs: 36 | - get 37 | - list 38 | - watch 39 | - create 40 | - update 41 | - delete 42 | # Allow Event recording access 43 | - apiGroups: 44 | - "" 45 | resources: 46 | - events 47 | verbs: 48 | - create 49 | - update 50 | - patch 51 | # Allow Access to CRD 52 | - apiGroups: 53 | - apiextensions.k8s.io 54 | resources: 55 | - customresourcedefinitions 56 | verbs: 57 | - get 58 | - list 59 | - watch 60 | - create 61 | - update 62 | # Allow Access to flink applications under flink.k8s.io 63 | - apiGroups: 64 | - flink.k8s.io 65 | resources: 66 | - flinkapplications 67 | - flinkapplications/status 68 | - flinkapplications/finalizers 69 | verbs: 70 | - get 71 | - list 72 | - watch 73 | - create 74 | - update 75 | - delete 76 | - patch 77 | --- 78 | # Create a Service Account for flinkk8soperator 79 | apiVersion: v1 80 | kind: ServiceAccount 81 | metadata: 82 | name: flinkoperator 83 | namespace: flink-operator 84 | -------------------------------------------------------------------------------- /docs/blue_green_state_machine.mmd: -------------------------------------------------------------------------------- 1 | %% This file can be compiled into blue_green_state_machine.png by installing mermaidjs (https://mermaidjs.github.io/) and running 2 | %% mmdc -i blue_green_state_machine.mmd -o blue_green_state_machine.png -w 1732 -b transparent 3 | 4 | graph LR 5 | New --> ClusterStarting 6 | 7 | subgraph RunningGroup [Running] 8 | Running 9 | DeployFailed 10 | 
end 11 | 12 | subgraph UpdatingGroup [Updating] 13 | Running --> Updating 14 | Updating --> ClusterStarting 15 | DeployFailed --> Updating 16 | 17 | ClusterStarting -- savepoint disabled --> SubmittingJob 18 | ClusterStarting -- savepoint enabled --> Savepointing 19 | ClusterStarting -- Create fails --> DeployFailed 20 | 21 | Savepointing --> SubmittingJob 22 | Savepointing -- Savepoint fails --> Recovering 23 | 24 | Recovering --> SubmittingJob 25 | Recovering -- No externalized checkpoint --> RollingBackJob 26 | 27 | SubmittingJob -- first deploy --> Running 28 | SubmittingJob -- updating existing application --> DualRunning 29 | SubmittingJob -- job start fails --> RollingBackJob 30 | RollingBackJob --> DeployFailed 31 | 32 | DualRunning -- tearDownVersionHash set --> Running 33 | DualRunning -- tear down fails --> DeployFailed 34 | end 35 | 36 | linkStyle 4 stroke:#303030 37 | linkStyle 5 stroke:#303030 38 | linkStyle 6 stroke:#FF0000 39 | linkStyle 8 stroke:#FF0000 40 | linkStyle 10 stroke:#FF0000 41 | linkStyle 11 stroke:#303030 42 | linkStyle 12 stroke:#303030 43 | linkStyle 13 stroke:#FF0000 44 | linkStyle 14 stroke:#FF0000 45 | linkStyle 15 stroke:#303030 46 | linkStyle 16 stroke:#FF0000 -------------------------------------------------------------------------------- /docs/blue_green_state_machine.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lyft/flinkk8soperator/9eae44173ddf1dee9f8bbf3263100a4bc07a3049/docs/blue_green_state_machine.png -------------------------------------------------------------------------------- /docs/dual_state_machine.mmd: -------------------------------------------------------------------------------- 1 | %% This file can be compiled into state_machine.png by installing mermaidjs (https://mermaidjs.github.io/) and running 2 | %% mmdc -i dual_state_machine.mmd -o dual_state_machine.png -w 1732 -b transparent 3 | 4 | graph LR 5 | New --> ClusterStarting 6 | 7 | subgraph RunningGroup [Running] 8 | Running 9 | DeployFailed 10 | end 11 | 12 | subgraph UpdatingGroup [Updating] 13 | Running -- scale mode default --> Updating 14 | Running -- scale mode InPlace --> Rescaling 15 | Updating --> ClusterStarting 16 | Rescaling -- savepoint disabled --> Cancelling 17 | Rescaling -- savepoint enabled --> Savepointing 18 | DeployFailed --> Updating 19 | 20 | ClusterStarting -- savepoint disabled --> Cancelling 21 | ClusterStarting -- savepoint enabled --> Savepointing 22 | ClusterStarting -- Create fails --> DeployFailed 23 | 24 | Cancelling --> SubmittingJob 25 | Cancelling -- cancel fails --> RollingBackJob 26 | Savepointing --> SubmittingJob 27 | Savepointing -- savepoint fails --> Recovering 28 | 29 | Recovering --> SubmittingJob 30 | Recovering -- No externalized checkpoint --> RollingBackJob 31 | 32 | SubmittingJob --> Running 33 | SubmittingJob -- job start fails --> RollingBackJob 34 | RollingBackJob --> DeployFailed 35 | end 36 | 37 | linkStyle 4 stroke:#303030 38 | linkStyle 5 stroke:#303030 39 | linkStyle 6 stroke:#FF0000 40 | linkStyle 8 stroke:#FF0000 41 | linkStyle 10 stroke:#FF0000 42 | linkStyle 12 stroke:#FF0000 43 | linkStyle 14 stroke:#FF0000 -------------------------------------------------------------------------------- /docs/dual_state_machine.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lyft/flinkk8soperator/9eae44173ddf1dee9f8bbf3263100a4bc07a3049/docs/dual_state_machine.png 
-------------------------------------------------------------------------------- /docs/local_dev.md: -------------------------------------------------------------------------------- 1 | # Flink Operator local development 2 | 3 | This guide will describe how to get set up for local development of 4 | the Flink Operator. This is most likely useful for people actually 5 | developing the operator, but may also be useful for developers looking 6 | to develop their applications locally. 7 | 8 | ## Run the operator 9 | 10 | ### Install [Minikube](https://minikube.sigs.k8s.io/docs/start/#what-youll-need) 11 | 12 | You will want to start minikube with a Kubernetes version >1.16 and <=1.24, for example: 13 | `minikube start --kubernetes-version=v1.24.17` 14 | 15 | 16 | ### (Optional) Setup kubernetes dashboard 17 | 18 | This can be a handy complement to the CLI, especially for new users. 19 | 20 | ```bash 21 | $ kubectl apply -f https://raw.githubusercontent.com/kubernetes/dashboard/v1.10.0/src/deploy/recommended/kubernetes-dashboard.yaml 22 | $ kubectl proxy & 23 | $ open http://localhost:8001/api/v1/namespaces/kube-system/services/https:kubernetes-dashboard:/proxy/#!/overview 24 | ``` 25 | 26 | ### Set up your Go environment 27 | 28 | ```bash 29 | $ export GOPATH=~/src/go 30 | ``` 31 | 32 | (this should probably go into your shell's profile) 33 | 34 | ### Checkout the code 35 | 36 | ```bash 37 | $ mkdir -p $GOPATH/src/github.com/lyft 38 | $ cd $GOPATH/src/github.com/lyft 39 | $ git clone git@github.com:lyft/flinkk8soperator.git 40 | ``` 41 | 42 | ### Install the custom resource definition 43 | 44 | ```bash 45 | $ cd flinkk8soperator 46 | $ kubectl create -f deploy/crd.yaml 47 | ``` 48 | 49 | ### Install permissions 50 | ```bash 51 | $ kubectl create -f deploy/role.yaml 52 | $ kubectl create -f deploy/role-binding.yaml 53 | ``` 54 | 55 | ### Start the operator 56 | 57 | #### Option 1: run outside the kubernetes cluster 58 | 59 | In this mode, we run the operator locally (on our Mac) or inside the 60 | IDE and configure it to talk to the docker-for-mac kubernetes 61 | cluster. This is very convenient for development, as we can iterate 62 | quickly, use a debugger, etc. 63 | 64 | ```bash 65 | $ go mod download 66 | $ KUBERNETES_CONFIG="$HOME/.kube/config" go run ./cmd/flinkk8soperator/main.go --config=local_config.yaml 67 | ``` 68 | 69 | #### Option 2: run inside the kubernetes cluster 70 | 71 | This mode more realistically emulates how the operator will run in 72 | production; however, the turn-around time for changes is much longer. 73 | 74 | First, we need to build the docker container for the operator: 75 | 76 | ```bash 77 | $ docker build -t flinkk8soperator .
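# Note: deploy/flinkk8soperator_local.yaml uses imagePullPolicy: Never, so the image
# must already be present in the cluster's Docker daemon. If you are on minikube, one
# way to do this (a sketch, assuming a single-node minikube setup) is to build against
# minikube's daemon:
# $ eval $(minikube docker-env)
# $ docker build -t flinkk8soperator .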
78 | ``` 79 | 80 | Then create the operator cluster resources: 81 | 82 | ```bash 83 | $ kubectl create -f deploy/flinkk8soperator_local.yaml 84 | ``` 85 | 86 | ## Run an application 87 | 88 | ```bash 89 | $ kubectl create -f examples/wordcount/flink-operator-custom-resource.yaml 90 | ``` 91 | 92 | Now you should be able to see two pods (one for the jobmanager and one 93 | for the taskmanager) starting: 94 | 95 | ```bash 96 | $ kubectl get pods 97 | ``` 98 | 99 | You should also be able to access the jobmanager UI at: 100 | 101 | ```bash 102 | http://localhost:8001/api/v1/namespaces/default/services/{APP_NAME}-jm:8081/proxy/#/overview 103 | ``` 104 | 105 | (note you will need to be running `kubectl proxy` for this to work) 106 | 107 | You can tail the logs for the jobmanager (which may be useful for 108 | debugging failures) via: 109 | 110 | ```bash 111 | $ kubectl logs -f service/{APP_NAME}-jm 112 | ``` 113 | 114 | You can SSH into the jobmanager by running 115 | 116 | ```bash 117 | $ kubectl exec -it $(kubectl get pods -o=custom-columns=NAME:.metadata.name | grep "\-jm\-") -- /bin/bash 118 | ``` 119 | -------------------------------------------------------------------------------- /docs/quick-start-guide.md: -------------------------------------------------------------------------------- 1 | # Quick Start Guide 2 | 3 | If you are looking to develop and test the operator in your local machine, refer to [Local development guide](local_dev.md). 4 | 5 | Follow the steps below if you have a Kubernetes cluster up and running. 6 | 7 | ## Setup kubectl 8 | Follow the instructions [here](https://kubernetes.io/docs/tasks/tools/install-kubectl/) to install and setup kubectl 9 | 10 | ## Operator installation 11 | 12 | * Let's first create the custom resource definition, namespace, and roles for running the flink operator. 13 | 14 | ```bash 15 | $ kubectl create -f https://raw.githubusercontent.com/lyft/flinkk8soperator/v0.5.0/deploy/crd.yaml 16 | $ kubectl create -f https://raw.githubusercontent.com/lyft/flinkk8soperator/v0.5.0/deploy/namespace.yaml 17 | $ kubectl create -f https://raw.githubusercontent.com/lyft/flinkk8soperator/v0.5.0/deploy/role.yaml 18 | $ kubectl create -f https://raw.githubusercontent.com/lyft/flinkk8soperator/v0.5.0/deploy/role-binding.yaml 19 | ``` 20 | 21 | * Before creating the flink operator deployment, edit/update the operator config: 22 | 23 | ``` bash 24 | $ curl https://raw.githubusercontent.com/lyft/flinkk8soperator/v0.5.0/deploy/config.yaml 25 | ``` 26 | 27 | Replace the `{ingress_suffix}` to indicate your cluster's ingress url. 28 | 29 | ```yaml 30 | data: 31 | config: |- 32 | operator: 33 | ingressUrlFormat: "{{$jobCluster}}.{ingress_suffix}" 34 | logger: 35 | level: 4 36 | ``` 37 | 38 | Note: If `ingressUrlFormat` is not set, then no ingress is created for the application. 39 | 40 | Then create the ConfigMap on the cluster: 41 | ```bash 42 | $ kubectl create -f config.yaml 43 | ``` 44 | 45 | Finally, create the operator Deployment: 46 | ``` 47 | $ kubectl create -f https://raw.githubusercontent.com/lyft/flinkk8soperator/v0.5.0/deploy/flinkk8soperator.yaml 48 | ``` 49 | 50 | * Ensure that the flink operator pod is *RUNNING*, and check operator logs if needed. 51 | 52 | ```bash 53 | $ kubectl get pods -n flink-operator 54 | $ kubectl logs {pod-name} -n flink-operator 55 | ``` 56 | 57 | ## Running the example 58 | 59 | You can find a sample application to run with the flink operator [here](/examples/wordcount/). 
60 | Make sure to edit the value of `sha` with the most recently pushed tag found [here](https://hub.docker.com/r/lyft/wordcount-operator-example/tags) 61 | ```yaml 62 | image: docker.io/lyft/wordcount-operator-example:{sha} 63 | ``` 64 | 65 | To run a flink application, run the following command: 66 | 67 | ```bash 68 | $ kubectl create -f https://raw.githubusercontent.com/lyft/flinkk8soperator/v0.5.0/examples/wordcount/flink-operator-custom-resource.yaml 69 | ``` 70 | 71 | The above command will create the flink application custom resource in kubernetes. The operator will observe the custom resource, and will create a flink cluster in kubernetes. 72 | 73 | Command below should show deployments created for the application 74 | ```bash 75 | $ kubectl get deployments -n flink-operator 76 | ``` 77 | 78 | Check the phase and other status attributes in the custom resource 79 | ```bash 80 | $ kubectl get flinkapplication.flink.k8s.io -n flink-operator wordcount-operator-example -o yaml 81 | ``` 82 | 83 | The output should be something like this 84 | ```yaml 85 | apiVersion: v1beta1 86 | kind: FlinkApplication 87 | metadata: 88 | clusterName: "" 89 | creationTimestamp: 2019-07-30T07:35:42Z 90 | finalizers: 91 | - job.finalizers.flink.k8s.io 92 | generation: 1 93 | labels: 94 | environment: development 95 | name: wordcount-operator-example 96 | namespace: flink-operator 97 | resourceVersion: "1025774" 98 | selfLink: v1beta1 99 | uid: a2855178-b29c-11e9-9a3b-025000000001 100 | spec: 101 | entryClass: org.apache.flink.WordCount 102 | flinkConfig: 103 | state.backend.fs.checkpointdir: file:///checkpoints/flink/checkpoints 104 | state.checkpoints.dir: file:///checkpoints/flink/externalized-checkpoints 105 | state.savepoints.dir: file:///checkpoints/flink/savepoints 106 | taskmanager.heap.size: 200 107 | flinkVersion: "1.8" 108 | image: docker.io/lyft/wordcount-operator-example:3b0347b2cdc1bda817e72b3099dac1c1b1363311 109 | jarName: wordcount-operator-example-1.0.0-SNAPSHOT.jar 110 | jobManagerConfig: 111 | envConfig: {} 112 | replicas: 1 113 | resources: 114 | requests: 115 | cpu: 200m 116 | memory: 200Mi 117 | parallelism: 3 118 | restartNonce: "" 119 | savepointInfo: {} 120 | taskManagerConfig: 121 | envConfig: {} 122 | resources: 123 | requests: 124 | cpu: 200m 125 | memory: 200Mi 126 | taskSlots: 2 127 | status: 128 | clusterStatus: 129 | availableTaskSlots: 4 130 | health: Green 131 | healthyTaskManagers: 2 132 | numberOfTaskManagers: 2 133 | numberOfTaskSlots: 4 134 | deployHash: d9f8a6a8 135 | failedDeployHash: "" 136 | jobStatus: 137 | completedCheckpointCount: 0 138 | entryClass: org.apache.flink.WordCount 139 | failedCheckpointCount: 0 140 | health: Green 141 | jarName: wordcount-operator-example-1.0.0-SNAPSHOT.jar 142 | jobID: acd232a002dd5204669d1041736b8fa0 143 | jobRestartCount: 0 144 | lastCheckpointTime: null 145 | lastFailingTime: null 146 | parallelism: 3 147 | restorePath: "" 148 | restoreTime: null 149 | startTime: 2019-07-30T07:35:59Z 150 | state: FINISHED 151 | lastSeenError: null 152 | lastUpdatedAt: 2019-07-30T07:36:09Z 153 | phase: Running 154 | retryCount: 0 155 | ``` 156 | 157 | To check events for the `FlinkApplication` object, run the following command: 158 | 159 | ```bash 160 | $ kubectl describe flinkapplication.flink.k8s.io -n flink-operator wordcount-operator-example 161 | ``` 162 | 163 | This will show events similarly to the following: 164 | 165 | ``` 166 | Events: 167 | Type Reason Age From Message 168 | ---- ------ ---- ---- ------- 169 | Normal CreatingCluster 4m 
flinkK8sOperator Creating Flink cluster for deploy d9f8a6a8 170 | Normal JobSubmitted 3m flinkK8sOperator Flink job submitted to cluster with id acd232a002dd5204669d1041736b8fa0 171 | ``` 172 | -------------------------------------------------------------------------------- /docs/state_machine.md: -------------------------------------------------------------------------------- 1 | # Flink operator state machine 2 | 3 | The core logic of the operator resides in the state machine. Various stages of the deployment lifecycle are mapped to 4 | discrete states. The operator continuously monitors the FlinkApplication custom resource. When it becomes out of sync 5 | with the underlying Kubernetes resources, it takes the necessary actions to update those resources to the desired state. 6 | Typically this will involve traversing the state machine. The final desired state is `Running`, which indicates that a 7 | healthy Flink cluster has been started and the Flink job has been successfully submitted. 8 | 9 | The state machine for the `Dual` deployment mode (default) looks like this: 10 | ![Flink operator state machine for Dual deployment mode](dual_state_machine.png) 11 | The state machine for the `BlueGreen` deployment mode looks like this: 12 | ![Flink operator state machine for BlueGreen deployment mode](blue_green_state_machine.png) 13 | # States 14 | 15 | ### New / Updating 16 | `New` (indicated in the resource by the empty string) is the initial state that all FlinkApplication resources start in. 17 | The operator transitions to `Updating` when a change is made to an existing FlinkApplication. In both cases, a new cluster is 18 | created, and we transition to the ClusterStarting phase to monitor it. The deployment objects created by the operator are 19 | labelled and annotated as indicated in the custom resource. The operator also sets the corresponding environment 20 | variables and arguments for the containers to start up the Flink application from the image. 21 | #### BlueGreen deployment mode 22 | Along with the annotations and labels in the custom resources, the deployment objects are suffixed with the application 23 | version name, which is either `blue` or `green`. The version name is also injected into the container environment. 24 | Additionally, the external URLs for each of the versions are also suffixed with the color. 25 | ### Rescaling 26 | If `scaleMode` is set to `InPlace`, an increase in parallelism will trigger a progression to `Rescaling` rather than 27 | `Updating`. In this mode, we increase the size of the existing TaskManager deployment instead of creating a new one, after 28 | which we proceed with Cancelling or Savepointing depending on the `savepointDisabled` setting. 29 | #### BlueGreen deployment mode 30 | InPlace rescaling is not compatible with BlueGreen, so this state will not be reached in BlueGreen mode. 31 | ### ClusterStarting 32 | In this state, the operator monitors the Flink cluster created in the New state. Once it successfully starts, we check 33 | whether the spec has the `savepointDisabled` field set to true. If so, we transition to the `Cancelling` state; otherwise, to `Savepointing`. 34 | If we are unable to start the cluster for some reason (an invalid 35 | image, bad configuration, not enough Kubernetes resources, etc.), we transition to the `DeployFailed` state. 36 | #### BlueGreen deployment mode 37 | In this mode, once the new cluster is started, we transition into the `Savepointing`/`SubmittingJob` state based on the `savepointDisabled` 38 | flag.
There is no job cancellation involved in the update process during a BlueGreen deployment. 39 | ### Cancelling 40 | In this state, the operator attempts to cancel the running job (if one exists) and transition to the `SubmittingJob` state. 41 | If it fails, we transition to `RollingBack`. 42 | #### BlueGreen deployment mode 43 | This state is not reached during a BlueGreen deployment. 44 | ### Savepointing 45 | In the `Savepointing` state, the operator attempts to cancel the existing job with a 46 | [savepoint](https://ci.apache.org/projects/flink/flink-docs-release-1.8/ops/state/savepoints.html) (if this is the first 47 | deploy for the FlinkApplication and there is no existing job, we transition straight to `SubmittingJob`). The operator 48 | monitors the savepoint process until it succeeds or fails. If savepointing succeeds, we move to the `SubmittingJob` 49 | phase. If it fails, we move to the `Recovering` phase to attempt recovering from an externalized checkpoint. 50 | #### BlueGreen deployment mode 51 | In this state, during a BlueGreen deployment, the currently running Flink job is savepointed (without cancellation). 52 | ### Recovering 53 | If savepointing fails, the operator will look for an 54 | [externalized checkpoint](https://ci.apache.org/projects/flink/flink-docs-release-1.8/ops/state/checkpoints.html#resuming-from-a-retained-checkpoint) 55 | and attempt to use that for recovery. If one is not available, the application transitions to the `DeployFailed` state. 56 | Otherwise, it transitions to the `SubmittingJob` state. 57 | #### BlueGreen deployment mode 58 | There is no change in behavior for this state during a BlueGreen deployment. 59 | ### SubmittingJob 60 | In this state, the operator waits until the JobManager is ready, then attempts to submit the Flink job to the cluster. 61 | If we are updating an existing job or the user has specified a savepoint to restore from, that will be used. Once the 62 | job is successfully running, the application transitions to the `Running` state. If the job submission fails, we 63 | transition to the `RollingBack` state. 64 | #### BlueGreen deployment mode 65 | During a BlueGreen deployment, the operator submits a job to the newly created cluster (with a version that's different from the 66 | originally running Flink application version). 67 | ### RollingBack 68 | This state is reached when, in the middle of a deploy, the old job has been canceled but the new job did not come up 69 | successfully. In that case, we will attempt to roll back by resubmitting the old job on the old cluster, after which 70 | we transition to the `DeployFailed` state. 71 | #### BlueGreen deployment mode 72 | In the BlueGreen deployment mode, the operator does not attempt to resubmit the old job (as we never cancel it in the first place). 73 | We transition directly to the `DeployFailed` state. 74 | ### Running 75 | The `Running` state indicates that the FlinkApplication custom resource has reached the desired state, and the job is 76 | running in the Flink cluster. In this state, the operator continuously checks if the resource has been modified and 77 | monitors the health of the Flink cluster and job. 78 | #### BlueGreen deployment mode 79 | There is no change in behavior for this state during a BlueGreen deployment. 80 | ### DeployFailed 81 | The `DeployFailed` state operates exactly like the `Running` state. It exists to inform the user that an attempted 82 | update has failed, i.e., that the FlinkApplication status does not currently match the desired spec.
In this state, 83 | the user should look at the Flink logs and Kubernetes events to determine what went wrong. The user can then perform 84 | a new deploy by updating the FlinkApplication. 85 | #### BlueGreen deployment mode 86 | There is no change in behavior for this state during a BlueGreen deployment. 87 | ### Deleting 88 | This state indicates that the FlinkApplication resource has been deleted. The operator will clean up the job according 89 | to the configured DeleteMode. Once all cleanup steps have been performed, the FlinkApplication will be deleted. 90 | #### BlueGreen deployment mode 91 | In this mode, if there are two application versions running, both versions are deleted (as per the `DeleteMode` configuration). 92 | ### DualRunning 93 | This state is only ever reached when the FlinkApplication is deployed with the BlueGreen deployment mode. In this state, 94 | there are two application versions running — `blue` and `green`. Once a user is ready to tear down one of the versions, they 95 | set a `tearDownVersionHash`. If this is set, the operator then tears down the application version corresponding to 96 | the `tearDownVersionHash`. Once the teardown is complete, we transition back to the `Running` state. 97 | -------------------------------------------------------------------------------- /docs/user_guide.md: -------------------------------------------------------------------------------- 1 | # User Guide 2 | 3 | For a quick introduction on how to build and install the Kubernetes Operator for Apache Flink, and how to run some sample applications, please refer to the [Quick Start Guide](quick-start-guide.md). For a complete reference of the custom resource definition of the `FlinkApplication`, please refer to the [API Specification](crd.md). 4 | 5 | ## Working with FlinkApplications 6 | 7 | ### Building a new Flink application 8 | The Flink operator brings up a Jobmanager and Taskmanagers for an application in Kubernetes. It does this by creating [deployment](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/) objects based on the *image* field in the FlinkApplication custom resource object. For more information about building images, please refer to this [documentation](/examples/README.md) and [examples](/examples/wordcount/). 9 | 10 | ### Creating a New FlinkApplication 11 | 12 | A `FlinkApplication` can be created from a YAML file storing the `FlinkApplication` specification using the `kubectl apply -f <yaml-file-path>` command. Once a `FlinkApplication` is successfully created, the operator receives it and creates a Flink cluster, as configured in the specification, on the Kubernetes cluster. 13 | 14 | #### Mounting Volumes in the Flink Pods 15 | 16 | The Flink operator supports mounting user-defined [volumes](https://kubernetes.io/docs/concepts/storage/volumes/) in the Flink job manager and task manager pods. The volume can be of various types (e.g. configMap, secret, hostPath and persistentVolumeClaim). To specify the volume to be mounted, include `volumes` and `volumeMounts` under the `spec` section in the FlinkApp YAML. 17 | 18 | For example, the following YAML specifies a volume named `config-vol` populated by a ConfigMap named `dummy-cm`. The volume `config-vol` will be mounted at path `/opt/flink/mycm` in the job manager and task manager pods.
19 | 20 | ```yaml 21 | volumes: 22 | - name: config-vol 23 | configMap: 24 | name: dummy-cm 25 | volumeMounts: 26 | - name: config-vol 27 | mountPath: /opt/flink/mycm 28 | ``` 29 | 30 | ### Deleting a FlinkApplication 31 | 32 | A `FlinkApplication` can be deleted using the `kubectl delete <name>` command. Deleting a `FlinkApplication` deletes the Flink application custom resource and the Flink cluster associated with it. If the Flink job is running when the deletion happens, the Flink job is cancelled with a savepoint before the cluster is deleted. 33 | 34 | ### Updating an existing FlinkApplication 35 | 36 | A `FlinkApplication` can be updated using the `kubectl apply -f <yaml-file-path>` command. When a `FlinkApplication` is successfully updated, the operator observes that the resource has changed. Before deleting the existing deployment, the operator cancels the Flink job with a savepoint. After the savepoint succeeds, the operator deletes the existing deployment and submits a new Flink job from the savepoint in the new Flink cluster. 37 | 38 | ### Checking a FlinkApplication 39 | 40 | A `FlinkApplication` can be checked using the `kubectl describe flinkapplication.flink.k8s.io <name>` command. The output of the command shows the specification and status of the `FlinkApplication` as well as events associated with it. 41 | 42 | ## Customizing the Flink operator 43 | 44 | To customize the Flink operator, set/update these [configurations](https://github.com/lyft/flinkk8soperator/blob/master/pkg/controller/config/config.go). The values for the config can be set either through a [ConfigMap](/deploy/config.yaml) or through the command line. 45 | -------------------------------------------------------------------------------- /docs/who-is-using.md: -------------------------------------------------------------------------------- 1 | ## Who is using FlinkK8sOperator? 2 | 3 | | Organization | Contact (GitHub User Name) | Environment | Description of Use | 4 | | ------------- | ------------- | ------------- | ------------- | 5 | | Lyft | @anandswaminathan, @mwylde, @glaksh100 | Production | Streaming Platform | 6 | | CarTrack | @nasdin | Production | Data Engineering Infrastructure | 7 | | The Trade Desk | @JonnyIncognito | Production | Data Engineering Infrastructure | 8 | | Lightbend | @yuchaoran2011 | Production | Streaming Platform | 9 | | McAfee | @jmdacruz | Development | Streaming Platform | 10 | | Mux | @skidder | Production | Streaming Platform | 11 | | Zillow | @kelly-sm | Production | Streaming Platform | 12 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # Application examples 2 | 3 | This contains examples of applications that produce a Docker image compatible with the Flink operator. Please use these examples as a reference while building applications to be executed by the Flink operator. 4 | 5 | * The Flink operator custom resource contains an **image** field, and expects the image to have both Flink and the application code packaged in it. 6 | * The operator starts up the Jobmanager and Taskmanager pods using [Container Args](https://godoc.org/k8s.io/api/core/v1#Container). 7 | * The operator submits the Flink job through the [REST API](https://ci.apache.org/projects/flink/flink-docs-stable/monitoring/rest_api.html#jars-jarid-run) in the Jobmanager.
For this to work, the jar file of the application needs to be present in the folder as indicated by the config value [web.upload.dir](https://ci.apache.org/projects/flink/flink-docs-stable/ops/config.html#web-upload-dir). 8 | * The operator injects flink configuration through [environment variables](https://github.com/lyft/flinkk8soperator/blob/master/pkg/controller/flink/container_utils.go#L84). 9 | * If there are issues in the **image** that causes either pods to restart, or Flink cluster to not respond to REST API requests, the state machine will not transition beyond the **READY** state. 10 | -------------------------------------------------------------------------------- /examples/beam-python/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM flink:1.16.2-scala_2.12-java11 AS flink 2 | FROM apachebeam/python3.6_sdk:2.17.0 3 | 4 | # Install dependencies 5 | RUN set -ex \ 6 | && apt-get update \ 7 | && apt-get -y install \ 8 | gettext-base \ 9 | openjdk-11-jre-headless \ 10 | openjdk-11-jdk-headless \ 11 | && rm -rf /var/lib/apt/lists/* 12 | 13 | # add Flink from the official Flink image 14 | ENV FLINK_HOME=/opt/flink 15 | ENV PATH=$PATH:$FLINK_HOME/bin 16 | COPY --from=flink $FLINK_HOME $FLINK_HOME 17 | 18 | # Install the job server, this will be the Flink entry point 19 | RUN \ 20 | mkdir -p /opt/flink/flink-web-upload \ 21 | && ( \ 22 | cd /opt/flink/flink-web-upload \ 23 | && curl -f -O https://repository.apache.org/content/groups/public/org/apache/beam/beam-runners-flink-1.16-job-server/2.49.0/beam-runners-flink-1.16-job-server-2.49.0.jar \ 24 | && ln -s beam-runners-flink-1.16-job-server*.jar beam-runner.jar \ 25 | ) \ 26 | && echo 'jobmanager.web.upload.dir: /opt/flink' >> $FLINK_HOME/conf/flink-conf.yaml 27 | 28 | # Application code - this can be moved to an s2i assemble script 29 | COPY . /code 30 | WORKDIR /code/src 31 | RUN \ 32 | pip install -r /code/src/requirements.txt 33 | 34 | # entry point for FlinkK8sOperator Flink config 35 | COPY docker-entrypoint.sh / 36 | 37 | ENTRYPOINT ["/docker-entrypoint.sh"] 38 | EXPOSE 6123 8081 39 | CMD ["local"] 40 | -------------------------------------------------------------------------------- /examples/beam-python/README.md: -------------------------------------------------------------------------------- 1 | # Beam Python Application example 2 | 3 | This example shows how to build a Docker image for a Beam Python application that is compatible with the Flink operator, from Flink and Beam base containers. 4 | 5 | The Python SDK workers run within the task manager container and the pipeline is submitted through the native Flink entry point (no Beam job server required). For more information about the Beam deployment see this [document](https://docs.google.com/document/d/1z3LNrRtr8kkiFHonZ5JJM_L4NWNBBNcqRc_yAf6G0VI/edit#heading=h.fh2f571kms4d). 
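The wiring between the operator and the Beam entry point is configured entirely in the custom resource. The excerpt below is taken from the bundled `flink-operator-custom-resource.yaml` in this directory: the runner jar installed by the Dockerfile is referenced via `jarName`, the Beam portable client entry point is set as `entryClass`, and the Python driver command is passed through `programArgs`:

```yaml
jarName: "beam-runner.jar"
parallelism: 1
entryClass: "org.apache.beam.runners.flink.FlinkPortableClientEntryPoint"
programArgs: "--driver-cmd \"cd /code/src; exec python -m beam_example.pipeline --job_name=beam-flinkk8soperator-example\""
```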
6 | 7 | To deploy the example locally: `kubectl create -f flink-operator-custom-resource.yaml` 8 | 9 | Flink UI (after running `kubectl proxy`): `http://localhost:8001/api/v1/namespaces/flink-operator/services/beam-python-flinkk8soperator-example:8081/proxy/#/overview` 10 | -------------------------------------------------------------------------------- /examples/beam-python/docker-entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | drop_privs_cmd() { 4 | if [ $(id -u) != 0 ]; then 5 | # Don't need to drop privs if EUID != 0 6 | return 7 | elif [ -x /sbin/su-exec ]; then 8 | # Alpine 9 | echo su-exec 10 | else 11 | # Others 12 | #echo gosu flink 13 | echo "" 14 | fi 15 | } 16 | 17 | # Add in extra configs set by the operator 18 | if [ -n "$FLINK_PROPERTIES" ]; then 19 | echo "$FLINK_PROPERTIES" >> "$FLINK_HOME/conf/flink-conf.yaml" 20 | fi 21 | 22 | envsubst < $FLINK_HOME/conf/flink-conf.yaml > $FLINK_HOME/conf/flink-conf.yaml.tmp 23 | mv $FLINK_HOME/conf/flink-conf.yaml.tmp $FLINK_HOME/conf/flink-conf.yaml 24 | 25 | COMMAND=$@ 26 | 27 | if [ $# -lt 1 ]; then 28 | COMMAND="local" 29 | fi 30 | echo "COMMAND: $COMMAND" 31 | 32 | if [ "$COMMAND" = "help" ]; then 33 | echo "Usage: $(basename "$0") (jobmanager|taskmanager|local|help)" 34 | exit 0 35 | elif [ "$COMMAND" = "jobmanager" ]; then 36 | echo "Starting Job Manager" 37 | echo "config file: " && grep '^[^\n#]' "$FLINK_HOME/conf/flink-conf.yaml" 38 | exec $(drop_privs_cmd) "$FLINK_HOME/bin/jobmanager.sh" start-foreground 39 | elif [ "$COMMAND" = "taskmanager" ]; then 40 | echo "Starting Task Manager" 41 | echo "config file: " && grep '^[^\n#]' "$FLINK_HOME/conf/flink-conf.yaml" 42 | exec $(drop_privs_cmd) "$FLINK_HOME/bin/taskmanager.sh" start-foreground 43 | elif [ "$COMMAND" = "local" ]; then 44 | echo "Starting local cluster" 45 | exec $(drop_privs_cmd) "$FLINK_HOME/bin/jobmanager.sh" start-foreground local 46 | fi 47 | 48 | exec "$@" 49 | -------------------------------------------------------------------------------- /examples/beam-python/flink-operator-custom-resource.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: flink.k8s.io/v1beta1 2 | kind: FlinkApplication 3 | metadata: 4 | name: beam-python-flinkk8soperator-example 5 | namespace: flink-operator 6 | annotations: 7 | labels: 8 | environment: development 9 | spec: 10 | #image: docker.io/lyft/flinkk8soperator-example-beam:{sha} 11 | image: flinkk8soperator-example-beam 12 | flinkConfig: 13 | taskmanager.network.memory.fraction: 0.1 14 | taskmanager.network.memory.min: 10m 15 | state.backend.fs.checkpointdir: file:///checkpoints/flink/checkpoints 16 | state.checkpoints.dir: file:///checkpoints/flink/externalized-checkpoints 17 | state.savepoints.dir: file:///checkpoints/flink/savepoints 18 | jobManagerConfig: 19 | resources: 20 | requests: 21 | memory: "200Mi" 22 | cpu: "0.1" 23 | replicas: 1 24 | taskManagerConfig: 25 | taskSlots: 2 26 | resources: 27 | requests: 28 | memory: "200Mi" 29 | cpu: "0.1" 30 | flinkVersion: "1.16" 31 | jarName: "beam-runner.jar" 32 | parallelism: 1 33 | entryClass: "org.apache.beam.runners.flink.FlinkPortableClientEntryPoint" 34 | programArgs: "--driver-cmd \"cd /code/src; exec python -m beam_example.pipeline --job_name=beam-flinkk8soperator-example\"" 35 | deleteMode: None 36 | -------------------------------------------------------------------------------- /examples/beam-python/src/beam_example/__init__.py: 
-------------------------------------------------------------------------------- 1 | from __future__ import absolute_import -------------------------------------------------------------------------------- /examples/beam-python/src/beam_example/pipeline.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from __future__ import absolute_import 4 | 5 | import sys 6 | import apache_beam as beam 7 | from apache_beam.options.pipeline_options import PipelineOptions 8 | 9 | if __name__ == "__main__": 10 | # --job_endpoint argument supplied by the Flink entry point 11 | args = [ 12 | "--runner=PortableRunner", 13 | "--streaming", 14 | "--sdk_worker_parallelism=2", 15 | "--job_name=beam-on-flinkk8soperator", 16 | "--environment_type=PROCESS", 17 | "--environment_config={\"command\": \"/opt/apache/beam/boot\"}", 18 | ] 19 | # command line options override defaults 20 | args.extend(sys.argv[1:]) 21 | print("args: " + str(args)) 22 | pipeline = beam.Pipeline(options=PipelineOptions(args)) 23 | pcoll = (pipeline 24 | | beam.Create([0, 1, 2]) 25 | | beam.Map(lambda x: x)) 26 | result = pipeline.run() 27 | # streaming job does not finish 28 | #result.wait_until_finish() 29 | -------------------------------------------------------------------------------- /examples/beam-python/src/requirements.txt: -------------------------------------------------------------------------------- 1 | # Add your dependencies here 2 | # 3 | numpy==1.16.4 # via pyarrow 4 | apache-beam==2.17.0 5 | -------------------------------------------------------------------------------- /examples/wordcount/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM maven:3.9.3 as builder 2 | 3 | COPY src /usr/src/app/src 4 | COPY pom.xml /usr/src/app 5 | 6 | RUN mvn -f /usr/src/app/pom.xml clean package 7 | 8 | FROM flink:1.16.2-scala_2.12-java11 9 | 10 | COPY --from=builder /usr/src/app/target/ /code/target 11 | RUN ln -s /code/target $FLINK_HOME/flink-web-upload 12 | 13 | CMD ["help"] 14 | -------------------------------------------------------------------------------- /examples/wordcount/flink-operator-custom-resource.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: flink.k8s.io/v1beta1 2 | kind: FlinkApplication 3 | metadata: 4 | name: wordcount-operator-example 5 | namespace: flink-operator 6 | annotations: 7 | labels: 8 | environment: development 9 | spec: 10 | image: flink-wordcount 11 | deleteMode: None 12 | flinkConfig: 13 | state.backend.fs.checkpointdir: file:///checkpoints/flink/checkpoints 14 | state.checkpoints.dir: file:///checkpoints/flink/externalized-checkpoints 15 | state.savepoints.dir: file:///checkpoints/flink/savepoints 16 | web.upload.dir: /opt/flink 17 | jobManagerConfig: 18 | resources: 19 | requests: 20 | memory: "1500Mi" 21 | cpu: "0.1" 22 | replicas: 1 23 | taskManagerConfig: 24 | taskSlots: 3 25 | resources: 26 | requests: 27 | memory: "1500Mi" 28 | cpu: "0.1" 29 | flinkVersion: "1.16" 30 | jarName: "wordcount-operator-example-1.0.0-SNAPSHOT.jar" 31 | parallelism: 3 32 | entryClass: "org.apache.flink.WordCount" 33 | -------------------------------------------------------------------------------- /examples/wordcount/pom.xml: -------------------------------------------------------------------------------- 1 | 4 | 5 | 4.0.0 6 | org.apache.flink 7 | wordcount-operator-example 8 | 1.0.0-SNAPSHOT 9 | jar 10 | 11 | 12 | 11 13 | 11 14 | 15 | 
16 | WordCount 17 | 18 | 19 | 20 | org.apache.flink 21 | flink-java 22 | 1.16.2 23 | 24 | 25 | org.apache.flink 26 | flink-streaming-java 27 | 1.16.2 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /examples/wordcount/src/main/java/org/apache/flink/WordCount.java: -------------------------------------------------------------------------------- 1 | // https://github.com/apache/flink/blob/master/flink-examples/flink-examples-streaming/src/main/java/org/apache/flink/streaming/examples/wordcount/WordCount.java 2 | 3 | /* 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | 20 | package org.apache.flink; 21 | 22 | import org.apache.flink.api.common.functions.FlatMapFunction; 23 | import org.apache.flink.api.java.tuple.Tuple2; 24 | import org.apache.flink.api.java.utils.ParameterTool; 25 | import org.apache.flink.streaming.api.datastream.DataStream; 26 | import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; 27 | import org.apache.flink.util.WordCountData; 28 | import org.apache.flink.util.Collector; 29 | 30 | /** 31 | * Implements the "WordCount" program that computes a simple word occurrence 32 | * histogram over text files in a streaming fashion. 33 | * 34 | *

<p>The input is a plain text file with lines separated by newline characters. 35 | * 36 | * <p>Usage: <code>WordCount --input &lt;path&gt; --output &lt;path&gt;</code><br> 37 | * If no parameters are provided, the program is run with default data from 38 | * {@link WordCountData}. 39 | * 40 | * <p>This example shows how to: 41 | * <ul> 42 | * <li>write a simple Flink Streaming program, 43 | * <li>use tuple data types, 44 | * <li>write and use user-defined functions. 45 | * </ul>
46 | */ 47 | public class WordCount { 48 | 49 | // ************************************************************************* 50 | // PROGRAM 51 | // ************************************************************************* 52 | 53 | public static void main(String[] args) throws Exception { 54 | 55 | // Checking input parameters 56 | final ParameterTool params = ParameterTool.fromArgs(args); 57 | 58 | // set up the execution environment 59 | final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); 60 | 61 | // make parameters available in the web interface 62 | env.getConfig().setGlobalJobParameters(params); 63 | 64 | // get input data 65 | DataStream text; 66 | if (params.has("input")) { 67 | // read the text file from given input path 68 | text = env.readTextFile(params.get("input")); 69 | } else { 70 | System.out.println("Executing WordCount example with default input data set."); 71 | System.out.println("Use --input to specify file input."); 72 | // get default test text data 73 | text = env.fromElements(WordCountData.WORDS); 74 | } 75 | 76 | DataStream> counts = 77 | // split up the lines in pairs (2-tuples) containing: (word,1) 78 | text.flatMap(new Tokenizer()) 79 | // group by the tuple field "0" and sum up tuple field "1" 80 | .keyBy(0).sum(1); 81 | 82 | // emit result 83 | if (params.has("output")) { 84 | counts.writeAsText(params.get("output")); 85 | } else { 86 | System.out.println("Printing result to stdout. Use --output to specify output path."); 87 | counts.print(); 88 | } 89 | 90 | // execute program 91 | env.execute("Streaming WordCount"); 92 | } 93 | 94 | // ************************************************************************* 95 | // USER FUNCTIONS 96 | // ************************************************************************* 97 | 98 | /** 99 | * Implements the string tokenizer that splits sentences into words as a 100 | * user-defined FlatMapFunction. The function takes a line (String) and 101 | * splits it into multiple pairs in the form of "(word,1)" ({@code Tuple2}). 103 | */ 104 | public static final class Tokenizer implements FlatMapFunction> { 105 | 106 | @Override 107 | public void flatMap(String value, Collector> out) { 108 | // normalize and split the line 109 | String[] tokens = value.toLowerCase().split("\\W+"); 110 | 111 | // emit the pairs 112 | for (String token : tokens) { 113 | if (token.length() > 0) { 114 | out.collect(new Tuple2<>(token, 1)); 115 | } 116 | } 117 | } 118 | } 119 | 120 | } 121 | -------------------------------------------------------------------------------- /examples/wordcount/src/main/java/org/apache/flink/util/WordCountData.java: -------------------------------------------------------------------------------- 1 | // Copied from https://github.com/apache/flink/blob/master/flink-examples/flink-examples-batch/src/main/java/org/apache/flink/examples/java/wordcount/util/WordCountData.java 2 | 3 | /* 4 | * Licensed to the Apache Software Foundation (ASF) under one 5 | * or more contributor license agreements. See the NOTICE file 6 | * distributed with this work for additional information 7 | * regarding copyright ownership. The ASF licenses this file 8 | * to you under the Apache License, Version 2.0 (the 9 | * "License"); you may not use this file except in compliance 10 | * with the License. 
You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | */ 20 | 21 | package org.apache.flink.util; 22 | 23 | import org.apache.flink.api.java.DataSet; 24 | import org.apache.flink.api.java.ExecutionEnvironment; 25 | 26 | /** 27 | * Provides the default data sets used for the WordCount example program. 28 | * The default data sets are used, if no parameters are given to the program. 29 | * 30 | */ 31 | public class WordCountData { 32 | 33 | public static final String[] WORDS = new String[] { 34 | "To be, or not to be,--that is the question:--", 35 | "Whether 'tis nobler in the mind to suffer", 36 | "The slings and arrows of outrageous fortune", 37 | "Or to take arms against a sea of troubles,", 38 | "And by opposing end them?--To die,--to sleep,--", 39 | "No more; and by a sleep to say we end", 40 | "The heartache, and the thousand natural shocks", 41 | "That flesh is heir to,--'tis a consummation", 42 | "Devoutly to be wish'd. To die,--to sleep;--", 43 | "To sleep! perchance to dream:--ay, there's the rub;", 44 | "For in that sleep of death what dreams may come,", 45 | "When we have shuffled off this mortal coil,", 46 | "Must give us pause: there's the respect", 47 | "That makes calamity of so long life;", 48 | "For who would bear the whips and scorns of time,", 49 | "The oppressor's wrong, the proud man's contumely,", 50 | "The pangs of despis'd love, the law's delay,", 51 | "The insolence of office, and the spurns", 52 | "That patient merit of the unworthy takes,", 53 | "When he himself might his quietus make", 54 | "With a bare bodkin? who would these fardels bear,", 55 | "To grunt and sweat under a weary life,", 56 | "But that the dread of something after death,--", 57 | "The undiscover'd country, from whose bourn", 58 | "No traveller returns,--puzzles the will,", 59 | "And makes us rather bear those ills we have", 60 | "Than fly to others that we know not of?", 61 | "Thus conscience does make cowards of us all;", 62 | "And thus the native hue of resolution", 63 | "Is sicklied o'er with the pale cast of thought;", 64 | "And enterprises of great pith and moment,", 65 | "With this regard, their currents turn awry,", 66 | "And lose the name of action.--Soft you now!", 67 | "The fair Ophelia!--Nymph, in thy orisons", 68 | "Be all my sins remember'd." 
69 | }; 70 | 71 | public static DataSet getDefaultTextLineDataSet(ExecutionEnvironment env) { 72 | return env.fromElements(WORDS); 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/lyft/flinkk8soperator 2 | 3 | go 1.20 4 | 5 | require ( 6 | github.com/benlaurie/objecthash v0.0.0-20180202135721-d1e3d6079fc1 7 | github.com/hashicorp/go-version v1.2.1 8 | github.com/jarcoal/httpmock v1.0.4 9 | github.com/lyft/flytestdlib v0.2.10 10 | github.com/mitchellh/mapstructure v1.4.2 11 | github.com/pkg/errors v0.9.1 12 | github.com/spf13/cobra v1.8.0 13 | github.com/spf13/pflag v1.0.5 14 | github.com/stretchr/testify v1.8.0 15 | go.uber.org/zap v1.24.0 16 | gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c 17 | gopkg.in/resty.v1 v1.12.0 18 | sigs.k8s.io/controller-runtime v0.11.2 19 | ) 20 | 21 | require ( 22 | k8s.io/api v0.24.2 23 | k8s.io/apiextensions-apiserver v0.23.5 24 | k8s.io/apimachinery v0.24.2 25 | k8s.io/client-go v0.24.2 26 | k8s.io/code-generator v0.23.5 27 | k8s.io/klog/v2 v2.60.1 28 | ) 29 | 30 | require ( 31 | github.com/PuerkitoBio/purell v1.1.1 // indirect 32 | github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578 // indirect 33 | github.com/benbjohnson/clock v1.1.0 // indirect 34 | github.com/beorn7/perks v1.0.1 // indirect 35 | github.com/cespare/xxhash/v2 v2.2.0 // indirect 36 | github.com/cilium/ebpf v0.12.2 // indirect 37 | github.com/cosiner/argv v0.1.0 // indirect 38 | github.com/cpuguy83/go-md2man/v2 v2.0.3 // indirect 39 | github.com/davecgh/go-spew v1.1.1 // indirect 40 | github.com/derekparker/trie v0.0.0-20230829180723-39f4de51ef7d // indirect 41 | github.com/emicklei/go-restful v2.9.5+incompatible // indirect 42 | github.com/evanphx/json-patch v4.12.0+incompatible // indirect 43 | github.com/fatih/color v1.7.0 // indirect 44 | github.com/fsnotify/fsnotify v1.6.0 // indirect 45 | github.com/go-delve/delve v1.21.2 // indirect 46 | github.com/go-delve/liner v1.2.3-0.20220127212407-d32d89dd2a5d // indirect 47 | github.com/go-logr/logr v1.2.0 // indirect 48 | github.com/go-openapi/jsonpointer v0.19.5 // indirect 49 | github.com/go-openapi/jsonreference v0.19.5 // indirect 50 | github.com/go-openapi/swag v0.19.14 // indirect 51 | github.com/gogo/protobuf v1.3.2 // indirect 52 | github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect 53 | github.com/golang/protobuf v1.5.3 // indirect 54 | github.com/google/gnostic v0.5.7-v3refs // indirect 55 | github.com/google/go-cmp v0.5.9 // indirect 56 | github.com/google/go-dap v0.11.0 // indirect 57 | github.com/google/gofuzz v1.1.0 // indirect 58 | github.com/google/uuid v1.1.2 // indirect 59 | github.com/hashicorp/golang-lru v1.0.2 // indirect 60 | github.com/hashicorp/hcl v1.0.0 // indirect 61 | github.com/imdario/mergo v0.3.12 // indirect 62 | github.com/inconshreveable/mousetrap v1.1.0 // indirect 63 | github.com/josharian/intern v1.0.0 // indirect 64 | github.com/json-iterator/go v1.1.12 // indirect 65 | github.com/kr/pretty v0.3.1 // indirect 66 | github.com/kr/text v0.2.0 // indirect 67 | github.com/magiconair/properties v1.8.5 // indirect 68 | github.com/mailru/easyjson v0.7.6 // indirect 69 | github.com/mattn/go-colorable v0.1.13 // indirect 70 | github.com/mattn/go-isatty v0.0.20 // indirect 71 | github.com/mattn/go-runewidth v0.0.15 // indirect 72 | github.com/matttproud/golang_protobuf_extensions v1.0.4 // indirect 73 | 
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect 74 | github.com/modern-go/reflect2 v1.0.2 // indirect 75 | github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect 76 | github.com/pelletier/go-toml v1.9.3 // indirect 77 | github.com/pmezard/go-difflib v1.0.0 // indirect 78 | github.com/prometheus/client_golang v1.15.1 // indirect 79 | github.com/prometheus/client_model v0.4.0 // indirect 80 | github.com/prometheus/common v0.42.0 // indirect 81 | github.com/prometheus/procfs v0.9.0 // indirect 82 | github.com/rivo/uniseg v0.4.4 // indirect 83 | github.com/rogpeppe/go-internal v1.10.0 // indirect 84 | github.com/russross/blackfriday/v2 v2.1.0 // indirect 85 | github.com/sirupsen/logrus v1.9.3 // indirect 86 | github.com/spf13/afero v1.6.0 // indirect 87 | github.com/spf13/cast v1.3.1 // indirect 88 | github.com/spf13/jwalterweatherman v1.1.0 // indirect 89 | github.com/spf13/viper v1.8.1 // indirect 90 | github.com/subosito/gotenv v1.2.0 // indirect 91 | go.starlark.net v0.0.0-20231101134539-556fd59b42f6 // indirect 92 | go.uber.org/atomic v1.7.0 // indirect 93 | go.uber.org/multierr v1.6.0 // indirect 94 | golang.org/x/arch v0.6.0 // indirect 95 | golang.org/x/exp v0.0.0-20231006140011-7918f672742d // indirect 96 | golang.org/x/mod v0.13.0 // indirect 97 | golang.org/x/net v0.16.0 // indirect 98 | golang.org/x/oauth2 v0.8.0 // indirect 99 | golang.org/x/sys v0.14.0 // indirect 100 | golang.org/x/term v0.13.0 // indirect 101 | golang.org/x/text v0.13.0 // indirect 102 | golang.org/x/time v0.0.0-20220210224613-90d013bbcef8 // indirect 103 | golang.org/x/tools v0.14.0 // indirect 104 | gomodules.xyz/jsonpatch/v2 v2.2.0 // indirect 105 | google.golang.org/appengine v1.6.7 // indirect 106 | google.golang.org/protobuf v1.30.0 // indirect 107 | gopkg.in/inf.v0 v0.9.1 // indirect 108 | gopkg.in/ini.v1 v1.62.0 // indirect 109 | gopkg.in/yaml.v2 v2.4.0 // indirect 110 | gopkg.in/yaml.v3 v3.0.1 // indirect 111 | k8s.io/component-base v0.24.2 // indirect 112 | k8s.io/gengo v0.0.0-20211129171323-c02415ce4185 // indirect 113 | k8s.io/kube-openapi v0.0.0-20220328201542-3ee0da9b0b42 // indirect 114 | k8s.io/utils v0.0.0-20220210201930-3a6ce19ff2f9 // indirect 115 | sigs.k8s.io/json v0.0.0-20211208200746-9f7c6b3444d2 // indirect 116 | sigs.k8s.io/structured-merge-diff/v4 v4.2.1 // indirect 117 | sigs.k8s.io/yaml v1.3.0 // indirect 118 | ) 119 | -------------------------------------------------------------------------------- /integ/README.md: -------------------------------------------------------------------------------- 1 | # Integration Tests 2 | 3 | This directory contains integration tests for the operator. These 4 | tests involve running the operator against a real Kubernetes system to 5 | validate its behavior. 6 | 7 | ## Running the integration tests 8 | 9 | You will need a few things to run these tests. Firstly, you will need 10 | a Kubernetes cluster and a kubeconfig file to talk to it. The easiest 11 | way to get this is probably to install Docker for Mac (if on Mac) or 12 | Minikube/microk8s on Linux. You will also need `kube proxy` running on 13 | port 8001. 14 | 15 | The tests can run in two modes: direct and image. In direct mode, the 16 | operator is run from the current source code from within the test. In 17 | image mode the operator is submitted to Kubernetes as a deployment and 18 | run from there. 19 | 20 | By default the tests create, use, and clean up the namespace 21 | `flinkoperatortest`. 
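To use a different namespace, override it with the `NAMESPACE` environment variable described under Options below; for example (using a hypothetical namespace name):

```
$ NAMESPACE=my-integ-tests INTEGRATION=true RUN_DIRECT=true go test
```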
22 | 23 | These tests use a sample Flink job [operator-test-app](/integ/operator-test-app/). The 24 | tests currently use two images built before the integration test is run. 25 | 26 | ### Setup 27 | 28 | These tests create and mount a directory located at `/tmp/checkpoints` 29 | into containers. You may need to configure this directory as a bind 30 | mount. The tests also need to create this directory with 31 | world-writable permissions. On Linux this may require that you 32 | run `umask 000` before running the tests. 33 | 34 | ``` 35 | $ kubectl proxy & 36 | $ go mod download 37 | ``` 38 | 39 | ### Running in Direct mode 40 | 41 | (from within this directory) 42 | 43 | ``` 44 | $ INTEGRATION=true RUN_DIRECT=true go test 45 | ``` 46 | 47 | ### Running in Image mode 48 | 49 | ``` 50 | $ INTEGRATION=true OPERATOR_IMAGE={operator image} go test 51 | ``` 52 | 53 | Note that you will need to either build an image with the tag flinkk8soperator:latest or specify the operator image using the 54 | `OPERATOR_IMAGE` environment variable. 55 | 56 | ### Options 57 | 58 | The behavior of the tests is controlled via environment 59 | variables. Supported options include: 60 | 61 | * `INTEGRATION` If not set, all integration tests will be skipped 62 | * `KUBERNETES_CONFIG` Should point to your Kubernetes config file 63 | (defaults to `~/.kube/config`) 64 | * `NAMESPACE` The namespace to use for all Kubernetes resources 65 | created by the tests. If set to default, the test framework will not 66 | create or delete the namespace. 67 | * `RUN_DIRECT` If set, will run the operator directly; otherwise will 68 | run it via a deployment inside Kubernetes 69 | * `OPERATOR_IMAGE` The image to use for the operator when running in image 70 | mode. By default, `lyft/flinkk8soperator:latest` 71 | 72 | You can also pass [gocheck](http://labix.org/gocheck) options to the 73 | test runner. Particularly useful is `-check.vv`, which will output logs 74 | from the operator and Flink pods to help debug test failures. 75 | 76 | ### Minikube Setup 77 | 78 | Ideally we'd use k8s 1.16 to match the deployed k8s version; however, this 79 | is non-trivial due to cgroup configurations. Instead, we will use a version 80 | that is compatible with v1beta1 CRDs, which corresponds to <1.22. CRD v1 81 | is only available with client >=1.16; however, the client used here is 1.14 82 | and the upgrade is non-trivial. 83 | TODO: https://jira.lyft.net/browse/STRMCMP-1659 84 | 85 | Ran on: 86 | - Go 1.12 87 | - Docker desktop 4.5.0 88 | - Minikube v1.29.0 (running 1.20.15) 89 | - i9 Ventura 13.2.1 90 | - GoLand 2021.3.3 91 | 92 | 93 | 1. Install Dependencies 94 | Run `go mod vendor` 95 | 96 | 2. Start minikube 97 | `minikube start --kubernetes-version=v1.24.17` 98 | 99 | 3. Set up test app images and operator image 100 | `integ/setup.sh` 101 | 102 | 4.
Set the following for the Go test: 103 | Package path: `github.com/lyft/flinkk8soperator/integ` 104 | Env: `INTEGRATION=true;OPERATOR_IMAGE=flinkk8soperator:local;RUN_DIRECT=true` 105 | Program Args: `-timeout 40m -check.vv IntegTest` 106 | 107 | 108 | Helpers: 109 | - Kill kube proxy 110 | `ps -ef | grep "kubectl proxy"` 111 | `kill -9 ` 112 | - Kill stuck flink app 113 | `kubectl patch FlinkApplication invalidcanceljob -p '{"metadata":{"finalizers":[]}}' --type=merge` 114 | - Set default namespace 115 | `kubectl config set-context --current --namespace=flinkoperatortest` 116 | -------------------------------------------------------------------------------- /integ/blue_green_deployment_test.go: -------------------------------------------------------------------------------- 1 | package integ 2 | 3 | import ( 4 | "context" 5 | "time" 6 | 7 | "github.com/lyft/flinkk8soperator/integ/log" 8 | "github.com/lyft/flinkk8soperator/pkg/apis/app/v1beta1" 9 | . "gopkg.in/check.v1" 10 | v1 "k8s.io/apimachinery/pkg/apis/meta/v1" 11 | ) 12 | 13 | func WaitForUpdate(c *C, s *IntegSuite, name string, updateFn func(app *v1beta1.FlinkApplication), phase v1beta1.FlinkApplicationPhase, failurePhase v1beta1.FlinkApplicationPhase) *v1beta1.FlinkApplication { 14 | // update with new image. 15 | app, err := s.Util.Update(name, updateFn) 16 | c.Assert(err, IsNil) 17 | 18 | for { 19 | // keep trying until the new job is launched 20 | newApp, err := s.Util.GetFlinkApplication(name) 21 | c.Assert(err, IsNil) 22 | if newApp.Status.VersionStatuses[s.Util.GetCurrentStatusIndex(app)].JobStatus.JobID != "" { 23 | break 24 | } 25 | time.Sleep(100 * time.Millisecond) 26 | } 27 | 28 | c.Assert(s.Util.WaitForPhase(name, phase, failurePhase), IsNil) 29 | c.Assert(s.Util.WaitForAllTasksRunning(name), IsNil) 30 | 31 | newApp, _ := s.Util.GetFlinkApplication(name) 32 | return newApp 33 | } 34 | 35 | func (s *IntegSuite) TestUpdateWithBlueGreenDeploymentMode(c *C) { 36 | log.Info("Starting test TestUpdateWithBlueGreenDeploymentMode") 37 | 38 | testName := "bluegreenupdate" 39 | const finalizer = "bluegreen.finalizers.test.com" 40 | 41 | // start a simple app 42 | config, err := s.Util.ReadFlinkApplication("test_app.yaml") 43 | c.Assert(err, IsNil, Commentf("Failed to read test app yaml")) 44 | 45 | config.Name = testName + "job" 46 | config.Spec.DeploymentMode = v1beta1.DeploymentModeBlueGreen 47 | config.ObjectMeta.Labels["integTest"] = testName 48 | config.Finalizers = append(config.Finalizers, finalizer) 49 | 50 | c.Assert(s.Util.CreateFlinkApplication(config), IsNil, 51 | Commentf("Failed to create flink application")) 52 | 53 | c.Assert(s.Util.WaitForPhase(config.Name, v1beta1.FlinkApplicationRunning, v1beta1.FlinkApplicationDeployFailed), IsNil) 54 | c.Assert(s.Util.WaitForAllTasksRunning(config.Name), IsNil) 55 | 56 | pods, err := s.Util.KubeClient.CoreV1().Pods(s.Util.Namespace.Name). 
57 | List(context.Background(), v1.ListOptions{LabelSelector: "integTest=" + testName}) 58 | c.Assert(err, IsNil) 59 | c.Assert(len(pods.Items), Equals, 2) 60 | for _, pod := range pods.Items { 61 | c.Assert(pod.Spec.Containers[0].Image, Equals, config.Spec.Image) 62 | } 63 | 64 | // test updating the app with a new image 65 | newApp := WaitForUpdate(c, s, config.Name, func(app *v1beta1.FlinkApplication) { 66 | app.Spec.Image = NewImage 67 | }, v1beta1.FlinkApplicationDualRunning, v1beta1.FlinkApplicationDeployFailed) 68 | 69 | c.Assert(newApp.Spec.Image, Equals, NewImage) 70 | c.Assert(newApp.Status.SavepointPath, NotNil) 71 | 72 | pods, err = s.Util.KubeClient.CoreV1().Pods(s.Util.Namespace.Name). 73 | List(context.Background(), v1.ListOptions{LabelSelector: "integTest=" + testName}) 74 | c.Assert(err, IsNil) 75 | // We have 2 applications running 76 | c.Assert(len(pods.Items), Equals, 4) 77 | c.Assert(s.Util.WaitForPhase(config.Name, v1beta1.FlinkApplicationDualRunning, v1beta1.FlinkApplicationDeployFailed), IsNil) 78 | c.Assert(s.Util.GetJobID(newApp), NotNil) 79 | c.Assert(newApp.Status.UpdatingVersion, Equals, v1beta1.BlueFlinkApplication) 80 | c.Assert(newApp.Status.DeployVersion, Equals, v1beta1.GreenFlinkApplication) 81 | 82 | // TearDownVersionHash 83 | teardownVersion := newApp.Status.DeployVersion 84 | hashToTeardown := newApp.Status.DeployHash 85 | oldHash := newApp.Status.DeployHash 86 | log.Infof("Tearing down version %s", teardownVersion) 87 | newApp = WaitForUpdate(c, s, config.Name, func(app *v1beta1.FlinkApplication) { 88 | app.Spec.TearDownVersionHash = hashToTeardown 89 | }, v1beta1.FlinkApplicationRunning, v1beta1.FlinkApplicationDeployFailed) 90 | 91 | // wait for the old cluster to be cleaned up 92 | for { 93 | pods, err := s.Util.KubeClient.CoreV1().Pods(s.Util.Namespace.Name).List(context.Background(), v1.ListOptions{}) 94 | c.Assert(err, IsNil) 95 | 96 | oldPodFound := false 97 | 98 | for _, pod := range pods.Items { 99 | if pod.Annotations["flink-app-hash"] == oldHash { 100 | oldPodFound = true 101 | } 102 | } 103 | 104 | if !oldPodFound { 105 | break 106 | } 107 | 108 | time.Sleep(100 * time.Millisecond) 109 | } 110 | 111 | c.Assert(s.Util.WaitForPhase(config.Name, v1beta1.FlinkApplicationRunning, v1beta1.FlinkApplicationDeployFailed), IsNil) 112 | c.Assert(newApp.Status.TeardownHash, NotNil) 113 | c.Assert(newApp.Status.DeployVersion, Equals, v1beta1.BlueFlinkApplication) 114 | c.Assert(newApp.Status.VersionStatuses[0].JobStatus.JobID, NotNil) 115 | c.Assert(newApp.Status.VersionStatuses[1].JobStatus, Equals, v1beta1.FlinkJobStatus{}) 116 | 117 | pods, err = s.Util.KubeClient.CoreV1().Pods(s.Util.Namespace.Name). 
118 | List(context.Background(), v1.ListOptions{LabelSelector: "flink-app-hash=" + oldHash}) 119 | for _, pod := range pods.Items { 120 | log.Infof("Pod name %s", pod.Name) 121 | c.Assert(pod.Labels["flink-application-version"], Not(Equals), teardownVersion) 122 | } 123 | 124 | c.Assert(err, IsNil) 125 | c.Assert(len(pods.Items), Equals, 0) 126 | 127 | // cleanup 128 | c.Assert(s.Util.FlinkApps().Delete(context.Background(), newApp.Name, v1.DeleteOptions{}), IsNil) 129 | var app *v1beta1.FlinkApplication 130 | for { 131 | app, err = s.Util.GetFlinkApplication(config.Name) 132 | c.Assert(err, IsNil) 133 | if len(app.Finalizers) == 1 && app.Finalizers[0] == finalizer { 134 | break 135 | } 136 | time.Sleep(100 * time.Millisecond) 137 | } 138 | 139 | job := s.Util.GetJobOverview(app) 140 | c.Assert(job["status"], Equals, "CANCELED") 141 | c.Assert(app.Status.SavepointPath, NotNil) 142 | 143 | // delete our finalizer 144 | app.Finalizers = []string{} 145 | _, err = s.Util.FlinkApps().Update(context.Background(), app, v1.UpdateOptions{}) 146 | c.Assert(err, IsNil) 147 | 148 | for { 149 | pods, err := s.Util.KubeClient.CoreV1().Pods(s.Util.Namespace.Name). 150 | List(context.Background(), v1.ListOptions{LabelSelector: "integTest=" + testName}) 151 | c.Assert(err, IsNil) 152 | if len(pods.Items) == 0 { 153 | break 154 | } 155 | } 156 | log.Info("All pods torn down") 157 | log.Info("Completed test TestUpdateWithBlueGreenDeploymentMode") 158 | } 159 | -------------------------------------------------------------------------------- /integ/checkpoint_failure_test.go: -------------------------------------------------------------------------------- 1 | package integ 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "time" 7 | 8 | "github.com/lyft/flinkk8soperator/integ/log" 9 | "github.com/lyft/flinkk8soperator/pkg/apis/app/v1beta1" 10 | . 
"gopkg.in/check.v1" 11 | coreV1 "k8s.io/api/core/v1" 12 | v1 "k8s.io/apimachinery/pkg/apis/meta/v1" 13 | ) 14 | 15 | func failingJobTest(s *IntegSuite, c *C, testName string, causeFailure func()) { 16 | // create a Flink app 17 | config, err := s.Util.ReadFlinkApplication("test_app.yaml") 18 | c.Assert(err, IsNil, Commentf("Failed to read test app yaml")) 19 | config.Name = testName + "job" 20 | config.Spec.DeleteMode = v1beta1.DeleteModeForceCancel 21 | 22 | config.ObjectMeta.Labels["integTest"] = testName 23 | 24 | // Cause it to fail 25 | causeFailure() 26 | 27 | c.Assert(s.Util.CreateFlinkApplication(config), IsNil, 28 | Commentf("Failed to create flink application")) 29 | 30 | c.Assert(s.Util.WaitForPhase(config.Name, v1beta1.FlinkApplicationRunning, v1beta1.FlinkApplicationDeployFailed), IsNil) 31 | 32 | // wait a bit for it to start failing 33 | time.Sleep(5 * time.Second) 34 | 35 | // Try to update it 36 | app, err := s.Util.GetFlinkApplication(config.Name) 37 | c.Assert(err, IsNil) 38 | app.Spec.Image = NewImage 39 | _, err = s.Util.FlinkApps().Update(context.Background(), app, v1.UpdateOptions{}) 40 | c.Assert(err, IsNil) 41 | 42 | // because the checkpoint will fail, the app should move to deploy failed 43 | c.Assert(s.Util.WaitForPhase(config.Name, v1beta1.FlinkApplicationDeployFailed), IsNil) 44 | 45 | // And the job should not have been updated 46 | newApp, err := s.Util.GetFlinkApplication(config.Name) 47 | c.Assert(err, IsNil) 48 | c.Assert(newApp.Status.JobStatus.JobID, Equals, app.Status.JobStatus.JobID) 49 | 50 | endpoint := fmt.Sprintf("jobs/%s", app.Status.JobStatus.JobID) 51 | _, err = s.Util.FlinkAPIGet(app, endpoint) 52 | c.Assert(err, IsNil) 53 | 54 | // delete the application and ensure everything is cleaned up successfully 55 | c.Assert(s.Util.FlinkApps().Delete(context.Background(), app.Name, v1.DeleteOptions{}), IsNil) 56 | 57 | for { 58 | pods, err := s.Util.KubeClient.CoreV1().Pods(s.Util.Namespace.Name). 
59 | List(context.Background(), v1.ListOptions{LabelSelector: "integTest=" + testName}) 60 | c.Assert(err, IsNil) 61 | if len(pods.Items) == 0 { 62 | break 63 | } 64 | } 65 | log.Info("All pods torn down") 66 | } 67 | 68 | // Tests that we correctly handle updating a job with a checkpoint timeout 69 | func (s *IntegSuite) TestCheckpointTimeout(c *C) { 70 | log.Info("Starting test TestCheckpointTimeout") 71 | 72 | failingJobTest(s, c, "checkpointtimeout", func() { 73 | // cause checkpoints to take 120 seconds 74 | err := s.Util.ExecuteCommand("minikube", "ssh", "echo 120000 >> /tmp/checkpoints/checkpoint_delay && sudo chmod 0644 /tmp/checkpoints/checkpoint_delay") 75 | c.Assert(err, IsNil) 76 | }) 77 | log.Info("Completed test TestCheckpointTimeout") 78 | } 79 | 80 | func appUpdate(app *v1beta1.FlinkApplication) *v1beta1.FlinkApplication { 81 | app.Spec.Image = NewImage 82 | skipFailureEnvVar := coreV1.EnvVar{Name: "SKIP_INDUCED_FAILURE", Value: "true"} 83 | app.Spec.JobManagerConfig.EnvConfig.Env = append(app.Spec.JobManagerConfig.EnvConfig.Env, skipFailureEnvVar) 84 | app.Spec.TaskManagerConfig.EnvConfig.Env = append(app.Spec.TaskManagerConfig.EnvConfig.Env, skipFailureEnvVar) 85 | var maxCheckpointRestoreAgeSeconds int32 = 1 86 | app.Spec.MaxCheckpointRestoreAgeSeconds = &maxCheckpointRestoreAgeSeconds 87 | return app 88 | } 89 | 90 | func failingTaskTest(s *IntegSuite, c *C, testName string, fallbackWithoutState bool, deployShouldFail bool, causeFailure func()) { 91 | config, err := s.Util.ReadFlinkApplication("test_app.yaml") 92 | c.Assert(err, IsNil, Commentf("Failed to read test app yaml")) 93 | config.Name = testName + "job" 94 | config.Spec.DeleteMode = v1beta1.DeleteModeForceCancel 95 | config.Spec.FallbackWithoutState = fallbackWithoutState 96 | config.ObjectMeta.Labels["integTest"] = testName 97 | 98 | // Avoid external checkpoints to be used in recovery stage during update 99 | err = s.Util.ExecuteCommand("minikube", "ssh", "echo 120000 >> /tmp/checkpoints/checkpoint_delay && sudo chmod 0644 /tmp/checkpoints/checkpoint_delay") 100 | c.Assert(err, IsNil) 101 | 102 | c.Assert(s.Util.CreateFlinkApplication(config), IsNil, 103 | Commentf("Failed to create flink application")) 104 | 105 | c.Assert(s.Util.WaitForPhase(config.Name, v1beta1.FlinkApplicationRunning, v1beta1.FlinkApplicationDeployFailed), IsNil) 106 | 107 | // Cause it to fail 108 | causeFailure() 109 | 110 | // wait a bit for it to start failing 111 | time.Sleep(5 * time.Second) 112 | 113 | // get app details 114 | app, err := s.Util.GetFlinkApplication(config.Name) 115 | c.Assert(err, IsNil) 116 | 117 | if deployShouldFail { 118 | // Try to update it 119 | app, err := s.Util.GetFlinkApplication(config.Name) 120 | c.Assert(err, IsNil) 121 | app = appUpdate(app) 122 | _, err = s.Util.FlinkApps().Update(context.Background(), app, v1.UpdateOptions{}) 123 | c.Assert(err, IsNil) 124 | 125 | // because the checkpoint will fail, the app should move to deploy failed 126 | c.Assert(s.Util.WaitForPhase(config.Name, v1beta1.FlinkApplicationDeployFailed), IsNil) 127 | 128 | // And the job should not have been updated 129 | newApp, err := s.Util.GetFlinkApplication(config.Name) 130 | c.Assert(err, IsNil) 131 | c.Assert(newApp.Status.JobStatus.JobID, Equals, app.Status.JobStatus.JobID) 132 | } else { 133 | // Try to update it with app that does not fail on checkpoint 134 | newApp := WaitUpdateAndValidate(c, s, config.Name, func(app *v1beta1.FlinkApplication) { 135 | appUpdate(app) 136 | }, v1beta1.FlinkApplicationDeployFailed) 137 | 
138 | // Check job updated and started without savepointPath 139 | c.Assert(newApp.Status.JobStatus.JobID, Not(Equals), app.Status.JobStatus.JobID) 140 | c.Assert(newApp.Spec.SavepointPath, Equals, "") 141 | 142 | // Check new app has no failures 143 | endpoint := fmt.Sprintf("jobs/%s", newApp.Status.JobStatus.JobID) 144 | _, err = s.Util.FlinkAPIGet(newApp, endpoint) 145 | c.Assert(err, IsNil) 146 | } 147 | 148 | // delete the application and ensure everything is cleaned up successfully 149 | c.Assert(s.Util.FlinkApps().Delete(context.Background(), app.Name, v1.DeleteOptions{}), IsNil) 150 | 151 | for { 152 | pods, err := s.Util.KubeClient.CoreV1().Pods(s.Util.Namespace.Name). 153 | List(context.Background(), v1.ListOptions{LabelSelector: "integTest=" + testName}) 154 | c.Assert(err, IsNil) 155 | if len(pods.Items) == 0 { 156 | break 157 | } 158 | } 159 | log.Info("All pods torn down") 160 | } 161 | 162 | // Tests that we correctly handle updating a job with task failures 163 | func (s *IntegSuite) TestJobWithTaskFailures(c *C) { 164 | log.Info("Starting test TestJobWithTaskFailures") 165 | failingTaskTest(s, c, "taskfailure", false, true, func() { 166 | err := s.Util.ExecuteCommand("minikube", "ssh", "touch /tmp/checkpoints/fail && chmod 0644 /tmp/checkpoints/fail") 167 | c.Assert(err, IsNil) 168 | }) 169 | log.Info("Completed test TestJobWithTaskFailures") 170 | } 171 | 172 | func (s *IntegSuite) TestSavepointCheckpointFailureFallback(c *C) { 173 | log.Info("Starting test TestSavepointCheckpointFailureFallback") 174 | failingTaskTest(s, c, "recoveryfallback", true, false, func() { 175 | err := s.Util.ExecuteCommand("minikube", "ssh", "touch /tmp/checkpoints/fail && chmod 0644 /tmp/checkpoints/fail") 176 | c.Assert(err, IsNil) 177 | }) 178 | log.Info("Completed test TestSavepointCheckpointFailureFallback") 179 | } 180 | -------------------------------------------------------------------------------- /integ/install.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | 3 | set -e 4 | 5 | curl -LO -s https://storage.googleapis.com/minikube/releases/latest/minikube-linux-amd64 6 | sudo install minikube-linux-amd64 /usr/local/bin/minikube 7 | 8 | minikube config set memory 6800 9 | minikube start --kubernetes-version=v1.24.17 10 | 11 | go mod download 12 | -------------------------------------------------------------------------------- /integ/log/log.go: -------------------------------------------------------------------------------- 1 | package log 2 | 3 | import ( 4 | "flag" 5 | "fmt" 6 | "net/http" 7 | "strconv" 8 | "strings" 9 | 10 | "go.uber.org/zap" 11 | "go.uber.org/zap/zapcore" 12 | "go.uber.org/zap/zaptest" 13 | klog "k8s.io/klog/v2" 14 | ) 15 | 16 | var ( 17 | log Logger 18 | atom zap.AtomicLevel 19 | ) 20 | 21 | type Fields map[string]interface{} 22 | 23 | type Logger struct { 24 | *zap.SugaredLogger 25 | } 26 | 27 | func (l Logger) Print(v ...interface{}) { 28 | l.WithOptions(zap.AddCallerSkip(1)).Info(v...) 29 | } 30 | 31 | func (l Logger) Printf(format string, v ...interface{}) { 32 | l.WithOptions(zap.AddCallerSkip(1)).Infof(format, v...) 33 | } 34 | 35 | func (l Logger) Println(v ...interface{}) { 36 | msg := fmt.Sprintln(v...) 37 | l.WithOptions(zap.AddCallerSkip(1)).Info(msg[:len(msg)-1]) 38 | } 39 | 40 | func (l Logger) Warning(v ...interface{}) { 41 | l.WithOptions(zap.AddCallerSkip(1)).Warn(v...) 
42 | } 43 | 44 | func (l Logger) Warningf(format string, v ...interface{}) { 45 | l.WithOptions(zap.AddCallerSkip(1)).Warnf(format, v...) 46 | } 47 | 48 | func stringifyLargeFields(v []interface{}) { 49 | for i, field := range v { 50 | switch val := field.(type) { 51 | case int64: 52 | v[i] = strconv.FormatInt(val, 10) 53 | case uint64: 54 | v[i] = strconv.FormatUint(val, 10) 55 | } 56 | } 57 | } 58 | 59 | func (l Logger) With(v ...interface{}) Logger { 60 | if len(v) == 0 { 61 | return Logger{SugaredLogger: l.SugaredLogger} 62 | } 63 | stringifyLargeFields(v) 64 | return Logger{SugaredLogger: l.SugaredLogger.With(v...)} 65 | } 66 | 67 | func (l Logger) WithOptions(v ...zap.Option) Logger { 68 | if len(v) == 0 { 69 | return Logger{SugaredLogger: l.SugaredLogger} 70 | } 71 | return Logger{SugaredLogger: l.SugaredLogger.Desugar().WithOptions(v...).Sugar()} 72 | } 73 | 74 | func Info(v ...interface{}) { 75 | log.WithOptions(zap.AddCallerSkip(2)).Info(v...) 76 | } 77 | 78 | func Infof(format string, v ...interface{}) { 79 | log.WithOptions(zap.AddCallerSkip(2)).Infof(format, v...) 80 | } 81 | 82 | func Warn(v ...interface{}) { 83 | log.WithOptions(zap.AddCallerSkip(2)).Warn(v...) 84 | } 85 | 86 | func Warnf(format string, v ...interface{}) { 87 | log.WithOptions(zap.AddCallerSkip(2)).Warnf(format, v...) 88 | } 89 | 90 | func Error(v ...interface{}) { 91 | log.WithOptions(zap.AddCallerSkip(2)).Error(v...) 92 | } 93 | 94 | func Errorf(format string, v ...interface{}) { 95 | log.WithOptions(zap.AddCallerSkip(2)).Errorf(format, v...) 96 | } 97 | 98 | func Print(v ...interface{}) { 99 | log.WithOptions(zap.AddCallerSkip(2)).Print(v...) 100 | } 101 | 102 | func Printf(format string, v ...interface{}) { 103 | log.WithOptions(zap.AddCallerSkip(2)).Printf(format, v...) 104 | } 105 | 106 | func Println(v ...interface{}) { 107 | log.WithOptions(zap.AddCallerSkip(2)).Println(v...) 108 | } 109 | 110 | func Fatal(v ...interface{}) { 111 | log.WithOptions(zap.AddCallerSkip(2)).Fatal(v...) 112 | } 113 | 114 | func Fatalf(format string, v ...interface{}) { 115 | log.WithOptions(zap.AddCallerSkip(2)).Fatalf(format, v...) 116 | } 117 | 118 | func Fatalln(v ...interface{}) { 119 | msg := fmt.Sprintln(v...) 120 | log.WithOptions(zap.AddCallerSkip(2)).Fatal(msg) 121 | } 122 | 123 | func Panic(v ...interface{}) { 124 | log.WithOptions(zap.AddCallerSkip(2)).Panic(v...) 125 | } 126 | 127 | func Panicf(format string, v ...interface{}) { 128 | log.WithOptions(zap.AddCallerSkip(2)).Panicf(format, v...) 129 | } 130 | 131 | func Panicln(v ...interface{}) { 132 | msg := fmt.Sprintln(v...) 
133 | log.WithOptions(zap.AddCallerSkip(2)).Panic(msg[:len(msg)-1]) 134 | } 135 | 136 | func NewEntry() Logger { 137 | return log 138 | } 139 | 140 | const timeKey = "time" 141 | 142 | func EnableProductionLogging(lvl zapcore.Level) { 143 | atom.SetLevel(lvl) 144 | encCfg := zap.NewProductionEncoderConfig() 145 | encCfg.EncodeTime = zapcore.RFC3339NanoTimeEncoder 146 | encCfg.TimeKey = timeKey 147 | cfg := zap.Config{ 148 | Level: atom, 149 | Encoding: "json", 150 | EncoderConfig: encCfg, 151 | OutputPaths: []string{"stderr"}, 152 | ErrorOutputPaths: []string{"stderr"}, 153 | } 154 | newLog, _ := cfg.Build() 155 | log.SugaredLogger = newLog.Sugar() 156 | } 157 | 158 | func NewErrorOnlyLogger() Logger { 159 | encCfg := zap.NewProductionEncoderConfig() 160 | encCfg.EncodeTime = zapcore.RFC3339NanoTimeEncoder 161 | encCfg.TimeKey = timeKey 162 | cfg := zap.Config{ 163 | Level: zap.NewAtomicLevelAt(zapcore.ErrorLevel), 164 | Encoding: "json", 165 | EncoderConfig: encCfg, 166 | OutputPaths: []string{"stderr"}, 167 | ErrorOutputPaths: []string{"stderr"}, 168 | } 169 | l, _ := cfg.Build() 170 | return Logger{SugaredLogger: l.Sugar()} 171 | } 172 | 173 | func NewTestLogger(t zaptest.TestingT) Logger { 174 | return Logger{SugaredLogger: zaptest.NewLogger(t).Sugar()} 175 | } 176 | 177 | func Level() zapcore.Level { 178 | return atom.Level() 179 | } 180 | 181 | func LevelHandler() http.Handler { 182 | return atom 183 | } 184 | 185 | func init() { 186 | cfg := zap.NewDevelopmentConfig() 187 | atom = cfg.Level 188 | l, _ := cfg.Build() 189 | log = Logger{l.Sugar()} 190 | 191 | fs := flag.NewFlagSet("klog", flag.ExitOnError) 192 | klog.InitFlags(fs) 193 | defer klog.Flush() 194 | _ = fs.Set("logtostderr", "false") // Default is "true". 195 | _ = fs.Set("skip_headers", "true") // Skip headers with ts, etc added by klog. 196 | klog.SetOutputBySeverity("INFO", &klogWrapper{fn: log.Info}) 197 | klog.SetOutputBySeverity("WARNING", &klogWrapper{fn: log.Warn}) 198 | klog.SetOutputBySeverity("ERROR", &klogWrapper{fn: log.Error}) 199 | klog.SetOutputBySeverity("FATAL", &klogWrapper{fn: log.Fatal}) 200 | } 201 | 202 | type klogWrapper struct { 203 | fn func(...interface{}) 204 | } 205 | 206 | func (w *klogWrapper) Write(p []byte) (n int, err error) { 207 | w.fn(strings.TrimSuffix(string(p), "\n")) 208 | return len(p), nil 209 | } 210 | -------------------------------------------------------------------------------- /integ/main_test.go: -------------------------------------------------------------------------------- 1 | package integ 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "os" 7 | "path/filepath" 8 | "testing" 9 | "time" 10 | 11 | "github.com/lyft/flinkk8soperator/cmd/flinkk8soperator/cmd" 12 | "github.com/lyft/flinkk8soperator/integ/log" 13 | integFramework "github.com/lyft/flinkk8soperator/integ/utils" 14 | controllerConfig "github.com/lyft/flinkk8soperator/pkg/controller/config" 15 | flyteConfig "github.com/lyft/flytestdlib/config" 16 | . 
"gopkg.in/check.v1" 17 | k8sErrors "k8s.io/apimachinery/pkg/api/errors" 18 | v1 "k8s.io/apimachinery/pkg/apis/meta/v1" 19 | "k8s.io/client-go/util/homedir" 20 | ) 21 | 22 | type IntegSuite struct { 23 | Util *integFramework.TestUtil 24 | } 25 | 26 | var _ = Suite(&IntegSuite{}) 27 | 28 | func Test(t *testing.T) { 29 | // flag.Parse() 30 | TestingT(t) 31 | } 32 | 33 | func (s *IntegSuite) SetUpSuite(c *C) { 34 | // var namespace = flag.String("namespace", "flinkoperatortest", "namespace to use for testing") 35 | var namespace = os.Getenv("NAMESPACE") 36 | if namespace == "" { 37 | namespace = "flinkoperatortest" 38 | } 39 | // var runDirect = flag.Bool("runDirect", false, "if set, runs the operator from the current source instead of from an image") 40 | var runDirect = os.Getenv("RUN_DIRECT") != "" 41 | // var image = flag.String("operatorImage", "flinkk8soperator:latest", "image for the operator") 42 | var image = os.Getenv("OPERATOR_IMAGE") 43 | if image == "" { 44 | image = "flinkk8soperator:latest" 45 | } 46 | //var integration = flag.Bool("integration", false, "run integration tests") 47 | var integration = os.Getenv("INTEGRATION") != "" 48 | 49 | if !integration { 50 | // skip integration tests unless --integration is passed 51 | c.Skip("--integration not provided") 52 | return 53 | } 54 | 55 | kubeconfig := os.Getenv("KUBERNETES_CONFIG") 56 | fmt.Printf("Kube config: %s", kubeconfig) 57 | if kubeconfig == "" { 58 | kubeconfig = filepath.Join(homedir.HomeDir(), ".kube", "config") 59 | err := os.Setenv("KUBERNETES_CONFIG", kubeconfig) 60 | if err != nil { 61 | c.Fatalf("Failed to set KUBERNETES_CONFIG env") 62 | } 63 | } 64 | 65 | checkpointDir := os.Getenv("CHECKPOINT_DIR") 66 | if checkpointDir == "" { 67 | checkpointDir = "/tmp/checkpoints" 68 | } 69 | 70 | var err error 71 | s.Util, err = integFramework.New(namespace, kubeconfig, image, checkpointDir) 72 | if err != nil { 73 | c.Fatalf("Failed to set up test util: %v", err) 74 | } 75 | 76 | if err = s.Util.CreateCRD(); err != nil && !k8sErrors.IsAlreadyExists(err) { 77 | c.Fatalf("Failed to create CRD: %v", err) 78 | } 79 | 80 | if runDirect { 81 | config := controllerConfig.Config{ 82 | LimitNamespace: namespace, 83 | UseProxy: true, 84 | ResyncPeriod: flyteConfig.Duration{Duration: 3 * time.Second}, 85 | MaxErrDuration: flyteConfig.Duration{Duration: 6000 * time.Second}, 86 | FlinkJobVertexTimeout: flyteConfig.Duration{Duration: 3 * time.Minute}, 87 | MetricsPrefix: "flinkk8soperator", 88 | ProxyPort: flyteConfig.Port{Port: 8001}, 89 | } 90 | 91 | log.Info("Running operator directly") 92 | 93 | go func() { 94 | if err = cmd.Run(&config); err != nil { 95 | c.Fatalf("Failed to run operator: %v", err) 96 | } 97 | }() 98 | } else { 99 | if err = s.Util.CreateClusterRole(); err != nil && !k8sErrors.IsAlreadyExists(err) { 100 | c.Fatalf("Failed to create role: %v", err) 101 | } 102 | 103 | if err = s.Util.CreateServiceAccount(); err != nil && !k8sErrors.IsAlreadyExists(err) { 104 | c.Fatalf("Failed to create service account: %v", err) 105 | } 106 | 107 | if err = s.Util.CreateClusterRoleBinding(); err != nil && !k8sErrors.IsAlreadyExists(err) { 108 | c.Fatalf("Failed to create cluster role binding: %v", err) 109 | } 110 | 111 | if err = s.Util.CreateOperator(); err != nil { 112 | c.Fatalf("Failed to create operator: %v", err) 113 | } 114 | 115 | if err = s.Util.TailOperatorLogs(); err != nil { 116 | c.Fatalf("Failed to tail operator logs: %v", err) 117 | } 118 | } 119 | } 120 | 121 | func (s *IntegSuite) TearDownSuite(c *C) { 122 | if 
s != nil && s.Util != nil { 123 | log.Info("Cleaning up") 124 | s.Util.Cleanup() 125 | } 126 | } 127 | 128 | func (s *IntegSuite) SetUpTest(c *C) { 129 | // create checkpoint directory 130 | if err := s.Util.ExecuteCommand("minikube", "ssh", "sudo rm -rf /tmp/checkpoints"); err != nil { 131 | c.Fatalf("Failed to clean up checkpoint directory: %v", err) 132 | } 133 | if err := s.Util.ExecuteCommand("minikube", "ssh", "sudo mkdir /tmp/checkpoints && sudo chmod -R 0777 /tmp/checkpoints"); err != nil { 134 | c.Fatalf("Failed to create checkpoint directory: %v", err) 135 | } 136 | } 137 | 138 | func (s *IntegSuite) TearDownTest(c *C) { 139 | tms, err := s.Util.GetTaskManagerPods() 140 | if err == nil { 141 | for i, tm := range tms { 142 | fmt.Printf("\n\n######### TaskManager %d logs for debugging "+ 143 | "#########\n---------------------------\n", i) 144 | _ = s.Util.GetLogs(tm, nil) 145 | } 146 | } 147 | 148 | jm, err := s.Util.GetJobManagerPod() 149 | if err == nil { 150 | fmt.Printf("\n\n######### JobManager logs for debugging #########\n---------------------------\n") 151 | _ = s.Util.GetLogs(jm, nil) 152 | } 153 | 154 | fmt.Printf("\n\n######### Nodes for debugging #########\n---------------------------\n") 155 | err = s.Util.ExecuteCommand("kubectl", "describe", "nodes") 156 | c.Assert(err, IsNil) 157 | 158 | fmt.Printf("\n\n######### Pods for debugging #########\n---------------------------\n") 159 | err = s.Util.ExecuteCommand("kubectl", "get", "pods", "-n", "flinkoperatortest") 160 | c.Assert(err, IsNil) 161 | 162 | fmt.Printf("\n\n######### Pod details for debugging #########\n---------------------------\n") 163 | err = s.Util.ExecuteCommand("kubectl", "describe", "pods", "-n", "flinkoperatortest") 164 | c.Assert(err, IsNil) 165 | 166 | fmt.Printf("\n\n######### Flink Applications for debugging #########\n---------------------------\n") 167 | err = s.Util.ExecuteCommand("kubectl", "describe", "flinkapplications", "-n", "flinkoperatortest") 168 | c.Assert(err, IsNil) 169 | 170 | deleteOpts := &v1.DeleteOptions{} 171 | err = s.Util.FlinkApps().DeleteCollection(context.Background(), *deleteOpts, v1.ListOptions{}) 172 | if err != nil { 173 | log.Fatalf("Failed to clean up flink applications: %v", err) 174 | } 175 | 176 | if err := s.Util.ExecuteCommand("minikube", "ssh", "sudo rm -rf /tmp/checkpoints"); err != nil { 177 | c.Fatalf("Failed to delete checkpoint directory: %v", err) 178 | } 179 | } 180 | -------------------------------------------------------------------------------- /integ/operator-test-app/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM openjdk:8-jdk 2 | 3 | # Prepare environment 4 | ENV FLINK_HOME=/opt/flink 5 | ENV MAVEN_HOME=/opt/maven 6 | ENV HADOOP_HOME=/opt/hadoop 7 | ENV PATH=$FLINK_HOME/bin:$HADOOP_HOME/bin:$MAVEN_HOME/bin:$PATH 8 | 9 | COPY . 
/code 10 | 11 | # Configure Flink version 12 | ENV FLINK_VERSION=1.8.1 \ 13 | HADOOP_SCALA_VARIANT=scala_2.12 14 | 15 | # Install dependencies 16 | RUN set -ex; \ 17 | apt-get update; \ 18 | apt-get -y install libsnappy1v5; \ 19 | apt-get -y install netcat net-tools; \ 20 | apt-get -y install gettext-base; \ 21 | rm -rf /var/lib/apt/lists/* 22 | 23 | # Grab gosu for easy step-down from root 24 | ENV GOSU_VERSION 1.11 25 | RUN set -ex; \ 26 | wget -nv -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-$(dpkg --print-architecture)"; \ 27 | wget -nv -O /usr/local/bin/gosu.asc "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-$(dpkg --print-architecture).asc"; \ 28 | export GNUPGHOME="$(mktemp -d)"; \ 29 | rm -rf "$GNUPGHOME" /usr/local/bin/gosu.asc; \ 30 | chmod +x /usr/local/bin/gosu; \ 31 | gosu nobody true 32 | 33 | # Install Maven 34 | ENV MAVEN_VERSION 3.6.1 35 | RUN \ 36 | wget https://repo.maven.apache.org/maven2/org/apache/maven/apache-maven/$MAVEN_VERSION/apache-maven-$MAVEN_VERSION-bin.tar.gz; \ 37 | tar -zxvf apache-maven-$MAVEN_VERSION-bin.tar.gz; \ 38 | mv apache-maven-$MAVEN_VERSION $MAVEN_HOME; \ 39 | rm apache-maven-$MAVEN_VERSION-bin.tar.gz 40 | 41 | WORKDIR /code 42 | 43 | RUN \ 44 | mvn package; \ 45 | mkdir -p /opt/flink/flink-web-upload; \ 46 | cp flink-conf.yaml /usr/local/; \ 47 | cp /code/target/operator-test-app-1.0.0-SNAPSHOT.jar /opt/flink/flink-web-upload/ 48 | 49 | RUN groupadd --system --gid=9999 flink && \ 50 | useradd --system --home-dir $FLINK_HOME --uid=9999 --gid=flink flink 51 | WORKDIR $FLINK_HOME 52 | 53 | ENV FLINK_URL_FILE_PATH=flink/flink-${FLINK_VERSION}/flink-${FLINK_VERSION}-bin-${HADOOP_SCALA_VARIANT}.tgz 54 | ENV FLINK_TGZ_URL=https://archive.apache.org/dist/$FLINK_URL_FILE_PATH 55 | 56 | # Install Flink 57 | RUN set -ex; \ 58 | wget -nv -O flink.tgz "$FLINK_TGZ_URL"; \ 59 | \ 60 | tar -xf flink.tgz --strip-components=1; \ 61 | rm flink.tgz; \ 62 | \ 63 | chown -R flink:flink .; 64 | 65 | # Needed on OpenShift for the entrypoint script to work 66 | RUN chmod -R 777 /opt/flink 67 | 68 | # control script expects manifest.yaml at this location 69 | RUN chown -R flink:flink /var 70 | COPY docker-entrypoint.sh / 71 | ENTRYPOINT ["/docker-entrypoint.sh"] 72 | EXPOSE 6123 8081 73 | CMD ["local"] 74 | -------------------------------------------------------------------------------- /integ/operator-test-app/docker-entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | drop_privs_cmd() { 4 | if [ $(id -u) != 0 ]; then 5 | # Don't need to drop privs if EUID != 0 6 | return 7 | elif [ -x /sbin/su-exec ]; then 8 | # Alpine 9 | echo su-exec 10 | else 11 | # Others 12 | echo gosu flink 13 | fi 14 | } 15 | 16 | # Add in extra configs set by the operator 17 | if [ -n "$FLINK_PROPERTIES" ]; then 18 | echo "$FLINK_PROPERTIES" >> "/usr/local/flink-conf.yaml" 19 | fi 20 | 21 | envsubst < /usr/local/flink-conf.yaml > $FLINK_HOME/conf/flink-conf.yaml 22 | 23 | COMMAND=$@ 24 | 25 | if [ $# -lt 1 ]; then 26 | COMMAND="local" 27 | fi 28 | 29 | if [ "$COMMAND" = "help" ]; then 30 | echo "Usage: $(basename "$0") (jobmanager|taskmanager|local|help)" 31 | exit 0 32 | elif [ "$FLINK_DEPLOYMENT_TYPE" = "jobmanager" ]; then 33 | echo "Starting Job Manager" 34 | echo "config file: " && grep '^[^\n#]' "$FLINK_HOME/conf/flink-conf.yaml" 35 | exec $(drop_privs_cmd) "$FLINK_HOME/bin/jobmanager.sh" start-foreground 36 | elif [ "$FLINK_DEPLOYMENT_TYPE" = 
"taskmanager" ]; then 37 | echo "Starting Task Manager" 38 | echo "config file: " && grep '^[^\n#]' "$FLINK_HOME/conf/flink-conf.yaml" 39 | exec $(drop_privs_cmd) "$FLINK_HOME/bin/taskmanager.sh" start-foreground 40 | elif [ "$COMMAND" = "local" ]; then 41 | echo "Starting local cluster" 42 | exec $(drop_privs_cmd) "$FLINK_HOME/bin/jobmanager.sh" start-foreground local 43 | fi 44 | 45 | exec "$@" 46 | -------------------------------------------------------------------------------- /integ/operator-test-app/flink-conf.yaml: -------------------------------------------------------------------------------- 1 | jobmanager.web.submit.enable: true 2 | jobmanager.web.log.path: /var/log/jobmanager/current 3 | 4 | taskmanager.log.path: /var/log/taskmanager/current 5 | taskmanager.exit-on-fatal-akka-error: true 6 | taskmanager.network.memory.max: 2147483648 7 | taskmanager.network.memory.fraction: 0.125 8 | 9 | # Akka config 10 | akka.framesize: 20MB 11 | parallelism.default: 1 12 | 13 | web.upload.dir: /opt/flink 14 | 15 | # State backend config 16 | state.backend: rocksdb 17 | state.checkpoints.num-retained: 4 18 | 19 | # Restart strategy 20 | restart-strategy: fixed-delay 21 | restart-strategy.fixed-delay.delay: 0s 22 | restart-strategy.fixed-delay.attempts: 2147483647 23 | 24 | # These parameters control how often TaskManagers try to connect to a JobManager. 25 | # These values are set a bit lower than the defaults to make recovery and cluster restarts 26 | # a bit faster 27 | taskmanager.maxRegistrationDuration: Inf 28 | taskmanager.initial-registration-pause: 500 ms 29 | taskmanager.max-registration-pause: 5 s 30 | taskmanager.refused-registration-pause: 5 s 31 | -------------------------------------------------------------------------------- /integ/operator-test-app/pom.xml: -------------------------------------------------------------------------------- 1 | 4 | 5 | 4.0.0 6 | com.lyft 7 | operator-test-app 8 | 1.0.0-SNAPSHOT 9 | jar 10 | 11 | 12 | 1.8 13 | 1.8 14 | 15 | 16 | operator-test-app 17 | 18 | 19 | 20 | org.apache.flink 21 | flink-java 22 | 1.8.1 23 | 24 | 25 | org.apache.flink 26 | flink-streaming-java_2.11 27 | 1.8.1 28 | 29 | 30 | org.apache.flink 31 | flink-streaming-scala_2.11 32 | 1.8.1 33 | provided 34 | 35 | 36 | 37 | org.apache.flink 38 | flink-clients_2.11 39 | 1.8.1 40 | provided 41 | 42 | 43 | 44 | 45 | -------------------------------------------------------------------------------- /integ/operator-test-app/src/main/java/com/lyft/OperatorTestApp.java: -------------------------------------------------------------------------------- 1 | package com.lyft; 2 | 3 | import java.io.File; 4 | import java.io.IOException; 5 | import java.nio.file.Files; 6 | import java.util.Collections; 7 | import java.util.List; 8 | import java.util.concurrent.atomic.AtomicBoolean; 9 | import org.apache.flink.api.common.functions.MapFunction; 10 | import org.apache.flink.api.common.typeinfo.TypeHint; 11 | import org.apache.flink.api.common.typeinfo.TypeInformation; 12 | import org.apache.flink.api.java.tuple.Tuple2; 13 | import org.apache.flink.streaming.api.CheckpointingMode; 14 | import org.apache.flink.streaming.api.TimeCharacteristic; 15 | import org.apache.flink.streaming.api.checkpoint.ListCheckpointed; 16 | import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator; 17 | import org.apache.flink.streaming.api.environment.CheckpointConfig; 18 | import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; 19 | import 
org.apache.flink.streaming.api.functions.source.RichParallelSourceFunction; 20 | import org.apache.flink.streaming.api.windowing.time.Time; 21 | import org.slf4j.Logger; 22 | import org.slf4j.LoggerFactory; 23 | 24 | public class OperatorTestApp { 25 | public static class StreamingImpulseSource extends RichParallelSourceFunction<Long> implements 26 | ListCheckpointed<Long> { 27 | Logger log = LoggerFactory.getLogger(StreamingImpulseSource.class); 28 | 29 | private final AtomicBoolean cancelled = new AtomicBoolean(false); 30 | private long count = 0; 31 | private final int intervalMillis; 32 | 33 | public StreamingImpulseSource(int intervalMillis) { 34 | this.intervalMillis = intervalMillis; 35 | } 36 | 37 | @Override 38 | public void run(SourceContext<Long> ctx) throws IOException { 39 | while (!cancelled.get()) { 40 | synchronized (ctx.getCheckpointLock()) { 41 | ctx.collect(count++); 42 | } 43 | 44 | try { 45 | if (intervalMillis > 0) { 46 | Thread.sleep(intervalMillis); 47 | } 48 | } catch (InterruptedException e) { 49 | // pass 50 | } 51 | } 52 | 53 | } 54 | 55 | @Override 56 | public void cancel() { 57 | this.cancelled.set(true); 58 | } 59 | 60 | @Override 61 | public List<Long> snapshotState(long checkpointId, long timestamp) throws Exception { 62 | File file = new File("/checkpoints/checkpoint_delay"); 63 | if (file.exists()) { 64 | String checkpointDelay = new String(Files.readAllBytes(file.toPath())) 65 | .replaceAll("\n", ""); 66 | int delay = Integer.valueOf(checkpointDelay); 67 | log.info("Waiting {} milliseconds", delay); 68 | System.out.println(String.format("PRINT Waiting %d milliseconds", delay)); 69 | 70 | try { 71 | Thread.sleep(delay); 72 | } catch (InterruptedException e) { 73 | log.error("Interrupted", e); 74 | } 75 | } 76 | 77 | return Collections.singletonList(count); 78 | } 79 | 80 | @Override 81 | public void restoreState(List<Long> state) throws Exception { 82 | if (!state.isEmpty()) { 83 | count = state.get(0); 84 | } 85 | } 86 | } 87 | 88 | public static class MaybeFail implements MapFunction<Long, Long> { 89 | 90 | @Override 91 | public Long map(Long x) throws Exception { 92 | if (new File("/checkpoints/fail").exists() && !Settings.skipInducedFailure()) { 93 | throw new RuntimeException("FAILED!!!"); 94 | } 95 | 96 | return x; 97 | } 98 | } 99 | 100 | public static void main(String[] args) throws Exception { 101 | Logger log = LoggerFactory.getLogger(OperatorTestApp.class); 102 | 103 | log.info("Submitting job..."); 104 | 105 | String uid = "default"; 106 | if (args.length > 0) { 107 | uid = args[0]; 108 | } 109 | 110 | StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); 111 | 112 | configureEnvironment(env); 113 | 114 | SingleOutputStreamOperator<Long> dataStream = env 115 | .addSource(new StreamingImpulseSource(1000)) 116 | .map(new MaybeFail()) 117 | .map(x -> Tuple2.of(0, x)) 118 | .returns(TypeInformation.of(new TypeHint<Tuple2<Integer, Long>>(){})) 119 | .keyBy(0) 120 | .timeWindow(Time.seconds(10)) 121 | .max(1) 122 | .uid(uid) 123 | .map(x -> x.f1); 124 | 125 | dataStream.print(); 126 | 127 | env.execute("Window Count"); 128 | } 129 | 130 | private static void configureEnvironment(StreamExecutionEnvironment env) { 131 | env.getCheckpointConfig().setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE); 132 | env.getCheckpointConfig().setCheckpointTimeout(10_000); 133 | env.enableCheckpointing(5_000); 134 | env.setStreamTimeCharacteristic(TimeCharacteristic.ProcessingTime); 135 | 136 | if (System.getenv("EXTERNAL_CHECKPOINT") != null) { 137 | env.getCheckpointConfig() 138 |
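// With EXTERNAL_CHECKPOINT set, externalized checkpoints are retained even when the job is cancelled (RETAIN_ON_CANCELLATION).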
.enableExternalizedCheckpoints( 139 | CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION); 140 | } 141 | 142 | // It is normally safe to use this setting and it can be a big performance improvement as it 143 | // skips a per-event serializer copy. The caveat is that you must treat your data objects as 144 | // immutable. 145 | env.getConfig().enableObjectReuse(); 146 | } 147 | } 148 | -------------------------------------------------------------------------------- /integ/operator-test-app/src/main/java/com/lyft/Settings.java: -------------------------------------------------------------------------------- 1 | package com.lyft; 2 | 3 | public class Settings { 4 | private static final String SKIP_INDUCED_FAILURE = "SKIP_INDUCED_FAILURE"; 5 | 6 | public static boolean skipInducedFailure() { 7 | return System.getenv(SKIP_INDUCED_FAILURE) != null && System.getenv(SKIP_INDUCED_FAILURE).equals("true"); 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /integ/scaleup_test.go: -------------------------------------------------------------------------------- 1 | package integ 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "time" 7 | 8 | "github.com/lyft/flinkk8soperator/integ/log" 9 | "github.com/lyft/flinkk8soperator/pkg/apis/app/v1beta1" 10 | . "gopkg.in/check.v1" 11 | v1 "k8s.io/apimachinery/pkg/apis/meta/v1" 12 | ) 13 | 14 | // TODO: https://github.com/lyft/flinkk8soperator/issues/278 15 | func (s *IntegSuite) TestInPlaceScaleUp(c *C) { 16 | 17 | log.Info("Starting test TestInPlaceScaleUp") 18 | c.Skip("Skipping due to memory constraints in CI") 19 | const finalizer = "scaleup.finalizers.test.com" 20 | const testName = "test_in_place_scale_up" 21 | 22 | // start a simple app 23 | config, err := s.Util.ReadFlinkApplication("test_app.yaml") 24 | c.Assert(err, IsNil, Commentf("Failed to read test app yaml")) 25 | 26 | config.Spec.ScaleMode = "InPlace" 27 | config.Spec.Parallelism = 2 28 | config.ObjectMeta.Name = "inplace" 29 | config.ObjectMeta.Labels["integTest"] = testName 30 | // add a finalizer so that the flinkapplication won't be deleted until we've had a chance to look at it 31 | config.Finalizers = append(config.Finalizers, finalizer) 32 | 33 | c.Assert(s.Util.CreateFlinkApplication(config), IsNil, 34 | Commentf("Failed to create flink application")) 35 | 36 | c.Assert(s.Util.WaitForPhase(config.Name, v1beta1.FlinkApplicationRunning, v1beta1.FlinkApplicationDeployFailed), IsNil) 37 | c.Assert(s.Util.WaitForAllTasksRunning(config.Name), IsNil) 38 | 39 | pods, err := s.Util.KubeClient.CoreV1().Pods(s.Util.Namespace.Name). 40 | List(context.Background(), v1.ListOptions{LabelSelector: "integTest=" + testName}) 41 | c.Assert(err, IsNil) 42 | c.Assert(len(pods.Items), Equals, 2) 43 | for _, pod := range pods.Items { 44 | c.Assert(pod.Spec.Containers[0].Image, Equals, config.Spec.Image) 45 | } 46 | 47 | deployments, err := s.Util.KubeClient.AppsV1().Deployments(s.Util.Namespace.Name). 
48 | List(context.Background(), v1.ListOptions{LabelSelector: "flink-app=inplace,flink-deployment-type=taskmanager"}) 49 | c.Assert(err, IsNil) 50 | c.Assert(len(deployments.Items), Equals, 1) 51 | deployment := deployments.Items[0] 52 | 53 | log.Info("Application started successfully") 54 | 55 | // test updating the app with a new scale 56 | _, err = s.Util.Update("inplace", func(app *v1beta1.FlinkApplication) { 57 | app.Spec.Parallelism = 4 58 | }) 59 | c.Assert(err, IsNil) 60 | 61 | c.Assert(s.Util.WaitForPhase("inplace", v1beta1.FlinkApplicationRescaling, v1beta1.FlinkApplicationDeployFailed), IsNil) 62 | c.Assert(s.Util.WaitForPhase("inplace", v1beta1.FlinkApplicationSavepointing, v1beta1.FlinkApplicationDeployFailed), IsNil) 63 | c.Assert(s.Util.WaitForPhase("inplace", v1beta1.FlinkApplicationRunning, v1beta1.FlinkApplicationDeployFailed), IsNil) 64 | c.Assert(s.Util.WaitForAllTasksRunning("inplace"), IsNil) 65 | 66 | log.Info("Rescaled job started successfully") 67 | newApp, err := s.Util.GetFlinkApplication(config.Name) 68 | c.Assert(err, IsNil) 69 | 70 | // check that we savepointed and restored correctly 71 | endpoint := fmt.Sprintf("jobs/%s/checkpoints", newApp.Status.JobStatus.JobID) 72 | res, err := s.Util.FlinkAPIGet(newApp, endpoint) 73 | c.Assert(err, IsNil) 74 | 75 | body := res.(map[string]interface{}) 76 | restored := (body["latest"].(map[string]interface{}))["restored"] 77 | c.Assert(restored, NotNil) 78 | 79 | c.Assert(restored.(map[string]interface{})["is_savepoint"], Equals, true) 80 | 81 | // check that we have the correct number of total pods 82 | pods, err = s.Util.KubeClient.CoreV1().Pods(s.Util.Namespace.Name). 83 | List(context.Background(), v1.ListOptions{LabelSelector: "integTest=" + testName}) 84 | c.Assert(err, IsNil) 85 | c.Assert(len(pods.Items), Equals, 3) 86 | 87 | // check that we are still using the same deployment 88 | deployments2, err := s.Util.KubeClient.AppsV1().Deployments(s.Util.Namespace.Name). 89 | List(context.Background(), v1.ListOptions{LabelSelector: "flink-app=inplace,flink-deployment-type=taskmanager"}) 90 | c.Assert(err, IsNil) 91 | c.Assert(len(deployments2.Items), Equals, 1) 92 | deployment2 := deployments2.Items[0] 93 | c.Assert(deployment2.Name, Equals, deployment.Name) 94 | 95 | // ensure that we can now proceed to a normal deployment 96 | newApp = updateAndValidate(c, s, config.Name, func(app *v1beta1.FlinkApplication) { 97 | app.Spec.Image = NewImage 98 | }, v1beta1.FlinkApplicationDeployFailed) 99 | c.Assert(newApp.Spec.Image, Equals, NewImage) 100 | pods, err = s.Util.KubeClient.CoreV1().Pods(s.Util.Namespace.Name). 
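// after the image update, the pod count should be unchanged and every pod should be running the new image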
101 | List(context.Background(), v1.ListOptions{LabelSelector: "integTest=" + testName}) 102 | c.Assert(err, IsNil) 103 | c.Assert(len(pods.Items), Equals, 3) 104 | for _, pod := range pods.Items { 105 | c.Assert(pod.Spec.Containers[0].Image, Equals, NewImage) 106 | } 107 | 108 | // delete the application and ensure everything is cleaned up successfully 109 | c.Assert(s.Util.FlinkApps().Delete(context.Background(), config.Name, v1.DeleteOptions{}), IsNil) 110 | 111 | // validate that a savepoint was taken and the job was cancelled 112 | var app *v1beta1.FlinkApplication 113 | for { 114 | app, err = s.Util.GetFlinkApplication(config.Name) 115 | c.Assert(err, IsNil) 116 | 117 | if len(app.Finalizers) == 1 && app.Finalizers[0] == finalizer { 118 | break 119 | } 120 | time.Sleep(100 * time.Millisecond) 121 | } 122 | 123 | c.Assert(app.Status.SavepointPath, NotNil) 124 | job := func() map[string]interface{} { 125 | jobs, _ := s.Util.FlinkAPIGet(app, "/jobs") 126 | jobMap := jobs.(map[string]interface{}) 127 | jobList := jobMap["jobs"].([]interface{}) 128 | for _, j := range jobList { 129 | job := j.(map[string]interface{}) 130 | if job["id"] == app.Status.JobStatus.JobID { 131 | return job 132 | } 133 | } 134 | return nil 135 | }() 136 | 137 | fmt.Printf("test job = %v", job) 138 | c.Assert(job["status"], Equals, "CANCELED") 139 | 140 | // delete our finalizer 141 | app.Finalizers = []string{} 142 | _, err = s.Util.FlinkApps().Update(context.Background(), app, v1.UpdateOptions{}) 143 | c.Assert(err, IsNil) 144 | 145 | // wait until all pods are gone 146 | for { 147 | pods, err = s.Util.KubeClient.CoreV1().Pods(s.Util.Namespace.Name). 148 | List(context.Background(), v1.ListOptions{LabelSelector: "integTest=" + testName}) 149 | c.Assert(err, IsNil) 150 | if len(pods.Items) == 0 { 151 | break 152 | } 153 | time.Sleep(100 * time.Millisecond) 154 | } 155 | log.Info("All pods torn down") 156 | log.Info("Completed test TestInPlaceScaleUp") 157 | } 158 | -------------------------------------------------------------------------------- /integ/setup.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Test App Setup 4 | 5 | # TODO: upgrade flink test app from 1.8 6 | cd integ/operator-test-app 7 | export TEST_APP_IMAGE=operator-test-app:$(git rev-parse HEAD) 8 | docker build -t $TEST_APP_IMAGE . 9 | docker tag $TEST_APP_IMAGE operator-test-app:local.1 10 | docker tag $TEST_APP_IMAGE operator-test-app:local.2 11 | minikube image load operator-test-app:local.1 12 | minikube image load operator-test-app:local.2 13 | 14 | cd ../../ 15 | 16 | # Operator Setup 17 | 18 | export DOCKER_IMAGE=flinkk8soperator:$(git rev-parse HEAD) 19 | export OPERATOR_IMAGE=flinkk8soperator:local 20 | 21 | docker build -t $DOCKER_IMAGE . 
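# Retag the freshly built image with the stable local tag the integration tests reference (flinkk8soperator:local) and load it into minikube's image cache.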
22 | docker tag $DOCKER_IMAGE $OPERATOR_IMAGE 23 | minikube image load $OPERATOR_IMAGE 24 | 25 | kubectl proxy --port 8001 & 26 | -------------------------------------------------------------------------------- /integ/test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | export INTEGRATION=true 6 | export OPERATOR_IMAGE=flinkk8soperator:local 7 | 8 | cd $(dirname "$0") 9 | go test -p 1 -timeout 60m -check.vv IntegSuite 10 | -------------------------------------------------------------------------------- /integ/test_app.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: flink.k8s.io/v1beta1 2 | kind: FlinkApplication 3 | metadata: 4 | name: operator-test-app 5 | annotations: 6 | labels: 7 | environment: development 8 | spec: 9 | image: operator-test-app:local.1 10 | imagePullPolicy: IfNotPresent 11 | imagePullSecrets: 12 | - name: dockerhub 13 | flinkConfig: 14 | state.backend.fs.checkpointdir: file:///checkpoints/flink/checkpoints 15 | state.checkpoints.dir: file:///checkpoints/flink/externalized-checkpoints 16 | state.savepoints.dir: file:///checkpoints/flink/savepoints 17 | env.java.opts.jobmanager: "-XX:+UseG1GC" 18 | jobManagerConfig: 19 | systemMemoryFraction: 0.2 20 | resources: 21 | requests: 22 | memory: "400Mi" 23 | cpu: "0.2" 24 | limits: 25 | memory: "800Mi" 26 | cpu: "0.2" 27 | replicas: 1 28 | taskManagerConfig: 29 | taskSlots: 2 30 | systemMemoryFraction: 0.5 31 | resources: 32 | requests: 33 | memory: "800Mi" 34 | cpu: "0.2" 35 | limits: 36 | memory: "800Mi" 37 | cpu: "0.2" 38 | volumeMounts: 39 | - mountPath: /checkpoints 40 | name: checkpoints 41 | volumes: 42 | - name: checkpoints 43 | hostPath: 44 | path: /tmp/checkpoints 45 | type: Directory 46 | flinkVersion: "1.11" 47 | deploymentMode: Dual 48 | jarName: "operator-test-app-1.0.0-SNAPSHOT.jar" 49 | parallelism: 2 50 | entryClass: "com.lyft.OperatorTestApp" 51 | -------------------------------------------------------------------------------- /local_config.yaml: -------------------------------------------------------------------------------- 1 | # This is default configuration file. 2 | # Real configuration when running inside K8s (local or otherwise) lives in a ConfigMap 3 | # The operator will replace "job" field with the correct flink job name 4 | # ingressUrlFormat: "{{$jobCluster}}.xyz.net" 5 | operator: 6 | useKubectlProxy: true 7 | containerNameFormat: "%s-unknown" 8 | metricsPrefix: "flinkk8soperator" 9 | resyncPeriod: 10s 10 | proxyPort: 8001 11 | baseBackoffDuration: 100ms 12 | maxBackoffDuration: 30s 13 | maxErrDuration: 1m 14 | logger: 15 | show-source: true 16 | level: 5 17 | formatter: 18 | type: text 19 | -------------------------------------------------------------------------------- /pkg/apis/app/addtoscheme_v1beta1.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018 Lyft. All rights reserved. 
3 | */ 4 | 5 | package apis 6 | 7 | import ( 8 | "github.com/lyft/flinkk8soperator/pkg/apis/app/v1beta1" 9 | ) 10 | 11 | func init() { 12 | // Register the types with the Scheme so the components can map objects to GroupVersionKinds and back 13 | AddToSchemes = append(AddToSchemes, v1beta1.SchemeBuilder.AddToScheme) 14 | } 15 | -------------------------------------------------------------------------------- /pkg/apis/app/apis.go: -------------------------------------------------------------------------------- 1 | package apis 2 | 3 | import ( 4 | "k8s.io/apimachinery/pkg/runtime" 5 | ) 6 | 7 | // AddToSchemes may be used to add all resources defined in the project to a Scheme 8 | var AddToSchemes runtime.SchemeBuilder 9 | 10 | // AddToScheme adds all Resources to the Scheme 11 | func AddToScheme(s *runtime.Scheme) error { 12 | return AddToSchemes.AddToScheme(s) 13 | } 14 | -------------------------------------------------------------------------------- /pkg/apis/app/v1alpha1/doc.go: -------------------------------------------------------------------------------- 1 | // +k8s:deepcopy-gen=package 2 | // +groupName=flink.k8s.io 3 | package v1alpha1 4 | -------------------------------------------------------------------------------- /pkg/apis/app/v1alpha1/register.go: -------------------------------------------------------------------------------- 1 | package v1alpha1 2 | 3 | import ( 4 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 5 | "k8s.io/apimachinery/pkg/runtime" 6 | "k8s.io/apimachinery/pkg/runtime/schema" 7 | ) 8 | 9 | const ( 10 | version = "v1alpha1" 11 | groupName = "flink.k8s.io" 12 | 13 | FlinkApplicationKind = "FlinkApplication" 14 | ) 15 | 16 | var ( 17 | SchemeBuilder = runtime.NewSchemeBuilder(addKnownTypes) 18 | AddToScheme = SchemeBuilder.AddToScheme 19 | // SchemeGroupVersion is the group version used to register these objects. 20 | SchemeGroupVersion = schema.GroupVersion{Group: groupName, Version: version} 21 | ) 22 | 23 | // GetKind takes an unqualified kind and returns back a Group qualified GroupKind 24 | func Kind(kind string) schema.GroupKind { 25 | return SchemeGroupVersion.WithKind(kind).GroupKind() 26 | } 27 | 28 | // Resource takes an unqualified resource and returns a Group qualified GroupResource 29 | func Resource(resource string) schema.GroupResource { 30 | return SchemeGroupVersion.WithResource(resource).GroupResource() 31 | } 32 | 33 | // addKnownTypes adds the set of types defined in this package to the supplied scheme. 
34 | func addKnownTypes(scheme *runtime.Scheme) error { 35 | scheme.AddKnownTypes(SchemeGroupVersion, 36 | &FlinkApplication{}, 37 | &FlinkApplicationList{}, 38 | ) 39 | 40 | metav1.AddToGroupVersion(scheme, SchemeGroupVersion) 41 | return nil 42 | } 43 | -------------------------------------------------------------------------------- /pkg/apis/app/v1beta1/doc.go: -------------------------------------------------------------------------------- 1 | // +k8s:deepcopy-gen=package 2 | // +groupName=flink.k8s.io 3 | package v1beta1 4 | -------------------------------------------------------------------------------- /pkg/apis/app/v1beta1/register.go: -------------------------------------------------------------------------------- 1 | package v1beta1 2 | 3 | import ( 4 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 5 | "k8s.io/apimachinery/pkg/runtime" 6 | "k8s.io/apimachinery/pkg/runtime/schema" 7 | ) 8 | 9 | const ( 10 | version = "v1beta1" 11 | groupName = "flink.k8s.io" 12 | 13 | FlinkApplicationKind = "FlinkApplication" 14 | ) 15 | 16 | var ( 17 | SchemeBuilder = runtime.NewSchemeBuilder(addKnownTypes) 18 | AddToScheme = SchemeBuilder.AddToScheme 19 | // SchemeGroupVersion is the group version used to register these objects. 20 | SchemeGroupVersion = schema.GroupVersion{Group: groupName, Version: version} 21 | ) 22 | 23 | // GetKind takes an unqualified kind and returns back a Group qualified GroupKind 24 | func Kind(kind string) schema.GroupKind { 25 | return SchemeGroupVersion.WithKind(kind).GroupKind() 26 | } 27 | 28 | // Resource takes an unqualified resource and returns a Group qualified GroupResource 29 | func Resource(resource string) schema.GroupResource { 30 | return SchemeGroupVersion.WithResource(resource).GroupResource() 31 | } 32 | 33 | // addKnownTypes adds the set of types defined in this package to the supplied scheme. 34 | func addKnownTypes(scheme *runtime.Scheme) error { 35 | scheme.AddKnownTypes(SchemeGroupVersion, 36 | &FlinkApplication{}, 37 | &FlinkApplicationList{}, 38 | ) 39 | 40 | metav1.AddToGroupVersion(scheme, SchemeGroupVersion) 41 | return nil 42 | } 43 | -------------------------------------------------------------------------------- /pkg/client/clientset/versioned/clientset.go: -------------------------------------------------------------------------------- 1 | // Code generated by client-gen. DO NOT EDIT. 2 | 3 | package versioned 4 | 5 | import ( 6 | "fmt" 7 | "net/http" 8 | 9 | flinkv1beta1 "github.com/lyft/flinkk8soperator/pkg/client/clientset/versioned/typed/app/v1beta1" 10 | discovery "k8s.io/client-go/discovery" 11 | rest "k8s.io/client-go/rest" 12 | flowcontrol "k8s.io/client-go/util/flowcontrol" 13 | ) 14 | 15 | type Interface interface { 16 | Discovery() discovery.DiscoveryInterface 17 | FlinkV1beta1() flinkv1beta1.FlinkV1beta1Interface 18 | } 19 | 20 | // Clientset contains the clients for groups. Each group has exactly one 21 | // version included in a Clientset. 22 | type Clientset struct { 23 | *discovery.DiscoveryClient 24 | flinkV1beta1 *flinkv1beta1.FlinkV1beta1Client 25 | } 26 | 27 | // FlinkV1beta1 retrieves the FlinkV1beta1Client 28 | func (c *Clientset) FlinkV1beta1() flinkv1beta1.FlinkV1beta1Interface { 29 | return c.flinkV1beta1 30 | } 31 | 32 | // Discovery retrieves the DiscoveryClient 33 | func (c *Clientset) Discovery() discovery.DiscoveryInterface { 34 | if c == nil { 35 | return nil 36 | } 37 | return c.DiscoveryClient 38 | } 39 | 40 | // NewForConfig creates a new Clientset for the given config. 
41 | // If config's RateLimiter is not set and QPS and Burst are acceptable, 42 | // NewForConfig will generate a rate-limiter in configShallowCopy. 43 | // NewForConfig is equivalent to NewForConfigAndClient(c, httpClient), 44 | // where httpClient was generated with rest.HTTPClientFor(c). 45 | func NewForConfig(c *rest.Config) (*Clientset, error) { 46 | configShallowCopy := *c 47 | 48 | // share the transport between all clients 49 | httpClient, err := rest.HTTPClientFor(&configShallowCopy) 50 | if err != nil { 51 | return nil, err 52 | } 53 | 54 | return NewForConfigAndClient(&configShallowCopy, httpClient) 55 | } 56 | 57 | // NewForConfigAndClient creates a new Clientset for the given config and http client. 58 | // Note the http client provided takes precedence over the configured transport values. 59 | // If config's RateLimiter is not set and QPS and Burst are acceptable, 60 | // NewForConfigAndClient will generate a rate-limiter in configShallowCopy. 61 | func NewForConfigAndClient(c *rest.Config, httpClient *http.Client) (*Clientset, error) { 62 | configShallowCopy := *c 63 | if configShallowCopy.RateLimiter == nil && configShallowCopy.QPS > 0 { 64 | if configShallowCopy.Burst <= 0 { 65 | return nil, fmt.Errorf("burst is required to be greater than 0 when RateLimiter is not set and QPS is set to greater than 0") 66 | } 67 | configShallowCopy.RateLimiter = flowcontrol.NewTokenBucketRateLimiter(configShallowCopy.QPS, configShallowCopy.Burst) 68 | } 69 | 70 | var cs Clientset 71 | var err error 72 | cs.flinkV1beta1, err = flinkv1beta1.NewForConfigAndClient(&configShallowCopy, httpClient) 73 | if err != nil { 74 | return nil, err 75 | } 76 | 77 | cs.DiscoveryClient, err = discovery.NewDiscoveryClientForConfigAndClient(&configShallowCopy, httpClient) 78 | if err != nil { 79 | return nil, err 80 | } 81 | return &cs, nil 82 | } 83 | 84 | // NewForConfigOrDie creates a new Clientset for the given config and 85 | // panics if there is an error in the config. 86 | func NewForConfigOrDie(c *rest.Config) *Clientset { 87 | cs, err := NewForConfig(c) 88 | if err != nil { 89 | panic(err) 90 | } 91 | return cs 92 | } 93 | 94 | // New creates a new Clientset for the given RESTClient. 95 | func New(c rest.Interface) *Clientset { 96 | var cs Clientset 97 | cs.flinkV1beta1 = flinkv1beta1.New(c) 98 | 99 | cs.DiscoveryClient = discovery.NewDiscoveryClient(c) 100 | return &cs 101 | } 102 | -------------------------------------------------------------------------------- /pkg/client/clientset/versioned/doc.go: -------------------------------------------------------------------------------- 1 | // Code generated by client-gen. DO NOT EDIT. 2 | 3 | // This package has the automatically generated clientset. 4 | package versioned 5 | -------------------------------------------------------------------------------- /pkg/client/clientset/versioned/fake/clientset_generated.go: -------------------------------------------------------------------------------- 1 | // Code generated by client-gen. DO NOT EDIT. 
2 | 3 | package fake 4 | 5 | import ( 6 | clientset "github.com/lyft/flinkk8soperator/pkg/client/clientset/versioned" 7 | flinkv1beta1 "github.com/lyft/flinkk8soperator/pkg/client/clientset/versioned/typed/app/v1beta1" 8 | fakeflinkv1beta1 "github.com/lyft/flinkk8soperator/pkg/client/clientset/versioned/typed/app/v1beta1/fake" 9 | "k8s.io/apimachinery/pkg/runtime" 10 | "k8s.io/apimachinery/pkg/watch" 11 | "k8s.io/client-go/discovery" 12 | fakediscovery "k8s.io/client-go/discovery/fake" 13 | "k8s.io/client-go/testing" 14 | ) 15 | 16 | // NewSimpleClientset returns a clientset that will respond with the provided objects. 17 | // It's backed by a very simple object tracker that processes creates, updates and deletions as-is, 18 | // without applying any validations and/or defaults. It shouldn't be considered a replacement 19 | // for a real clientset and is mostly useful in simple unit tests. 20 | func NewSimpleClientset(objects ...runtime.Object) *Clientset { 21 | o := testing.NewObjectTracker(scheme, codecs.UniversalDecoder()) 22 | for _, obj := range objects { 23 | if err := o.Add(obj); err != nil { 24 | panic(err) 25 | } 26 | } 27 | 28 | cs := &Clientset{tracker: o} 29 | cs.discovery = &fakediscovery.FakeDiscovery{Fake: &cs.Fake} 30 | cs.AddReactor("*", "*", testing.ObjectReaction(o)) 31 | cs.AddWatchReactor("*", func(action testing.Action) (handled bool, ret watch.Interface, err error) { 32 | gvr := action.GetResource() 33 | ns := action.GetNamespace() 34 | watch, err := o.Watch(gvr, ns) 35 | if err != nil { 36 | return false, nil, err 37 | } 38 | return true, watch, nil 39 | }) 40 | 41 | return cs 42 | } 43 | 44 | // Clientset implements clientset.Interface. Meant to be embedded into a 45 | // struct to get a default implementation. This makes faking out just the method 46 | // you want to test easier. 47 | type Clientset struct { 48 | testing.Fake 49 | discovery *fakediscovery.FakeDiscovery 50 | tracker testing.ObjectTracker 51 | } 52 | 53 | func (c *Clientset) Discovery() discovery.DiscoveryInterface { 54 | return c.discovery 55 | } 56 | 57 | func (c *Clientset) Tracker() testing.ObjectTracker { 58 | return c.tracker 59 | } 60 | 61 | var ( 62 | _ clientset.Interface = &Clientset{} 63 | _ testing.FakeClient = &Clientset{} 64 | ) 65 | 66 | // FlinkV1beta1 retrieves the FlinkV1beta1Client 67 | func (c *Clientset) FlinkV1beta1() flinkv1beta1.FlinkV1beta1Interface { 68 | return &fakeflinkv1beta1.FakeFlinkV1beta1{Fake: &c.Fake} 69 | } 70 | -------------------------------------------------------------------------------- /pkg/client/clientset/versioned/fake/doc.go: -------------------------------------------------------------------------------- 1 | // Code generated by client-gen. DO NOT EDIT. 2 | 3 | // This package has the automatically generated fake clientset. 4 | package fake 5 | -------------------------------------------------------------------------------- /pkg/client/clientset/versioned/fake/register.go: -------------------------------------------------------------------------------- 1 | // Code generated by client-gen. DO NOT EDIT. 
2 | 3 | package fake 4 | 5 | import ( 6 | flinkv1beta1 "github.com/lyft/flinkk8soperator/pkg/apis/app/v1beta1" 7 | v1 "k8s.io/apimachinery/pkg/apis/meta/v1" 8 | runtime "k8s.io/apimachinery/pkg/runtime" 9 | schema "k8s.io/apimachinery/pkg/runtime/schema" 10 | serializer "k8s.io/apimachinery/pkg/runtime/serializer" 11 | utilruntime "k8s.io/apimachinery/pkg/util/runtime" 12 | ) 13 | 14 | var scheme = runtime.NewScheme() 15 | var codecs = serializer.NewCodecFactory(scheme) 16 | 17 | var localSchemeBuilder = runtime.SchemeBuilder{ 18 | flinkv1beta1.AddToScheme, 19 | } 20 | 21 | // AddToScheme adds all types of this clientset into the given scheme. This allows composition 22 | // of clientsets, like in: 23 | // 24 | // import ( 25 | // "k8s.io/client-go/kubernetes" 26 | // clientsetscheme "k8s.io/client-go/kubernetes/scheme" 27 | // aggregatorclientsetscheme "k8s.io/kube-aggregator/pkg/client/clientset_generated/clientset/scheme" 28 | // ) 29 | // 30 | // kclientset, _ := kubernetes.NewForConfig(c) 31 | // _ = aggregatorclientsetscheme.AddToScheme(clientsetscheme.Scheme) 32 | // 33 | // After this, RawExtensions in Kubernetes types will serialize kube-aggregator types 34 | // correctly. 35 | var AddToScheme = localSchemeBuilder.AddToScheme 36 | 37 | func init() { 38 | v1.AddToGroupVersion(scheme, schema.GroupVersion{Version: "v1"}) 39 | utilruntime.Must(AddToScheme(scheme)) 40 | } 41 | -------------------------------------------------------------------------------- /pkg/client/clientset/versioned/scheme/doc.go: -------------------------------------------------------------------------------- 1 | // Code generated by client-gen. DO NOT EDIT. 2 | 3 | // This package contains the scheme of the automatically generated clientset. 4 | package scheme 5 | -------------------------------------------------------------------------------- /pkg/client/clientset/versioned/scheme/register.go: -------------------------------------------------------------------------------- 1 | // Code generated by client-gen. DO NOT EDIT. 2 | 3 | package scheme 4 | 5 | import ( 6 | flinkv1beta1 "github.com/lyft/flinkk8soperator/pkg/apis/app/v1beta1" 7 | v1 "k8s.io/apimachinery/pkg/apis/meta/v1" 8 | runtime "k8s.io/apimachinery/pkg/runtime" 9 | schema "k8s.io/apimachinery/pkg/runtime/schema" 10 | serializer "k8s.io/apimachinery/pkg/runtime/serializer" 11 | utilruntime "k8s.io/apimachinery/pkg/util/runtime" 12 | ) 13 | 14 | var Scheme = runtime.NewScheme() 15 | var Codecs = serializer.NewCodecFactory(Scheme) 16 | var ParameterCodec = runtime.NewParameterCodec(Scheme) 17 | var localSchemeBuilder = runtime.SchemeBuilder{ 18 | flinkv1beta1.AddToScheme, 19 | } 20 | 21 | // AddToScheme adds all types of this clientset into the given scheme. This allows composition 22 | // of clientsets, like in: 23 | // 24 | // import ( 25 | // "k8s.io/client-go/kubernetes" 26 | // clientsetscheme "k8s.io/client-go/kubernetes/scheme" 27 | // aggregatorclientsetscheme "k8s.io/kube-aggregator/pkg/client/clientset_generated/clientset/scheme" 28 | // ) 29 | // 30 | // kclientset, _ := kubernetes.NewForConfig(c) 31 | // _ = aggregatorclientsetscheme.AddToScheme(clientsetscheme.Scheme) 32 | // 33 | // After this, RawExtensions in Kubernetes types will serialize kube-aggregator types 34 | // correctly. 
35 | var AddToScheme = localSchemeBuilder.AddToScheme 36 | 37 | func init() { 38 | v1.AddToGroupVersion(Scheme, schema.GroupVersion{Version: "v1"}) 39 | utilruntime.Must(AddToScheme(Scheme)) 40 | } 41 | -------------------------------------------------------------------------------- /pkg/client/clientset/versioned/typed/app/v1beta1/app_client.go: -------------------------------------------------------------------------------- 1 | // Code generated by client-gen. DO NOT EDIT. 2 | 3 | package v1beta1 4 | 5 | import ( 6 | "net/http" 7 | 8 | v1beta1 "github.com/lyft/flinkk8soperator/pkg/apis/app/v1beta1" 9 | "github.com/lyft/flinkk8soperator/pkg/client/clientset/versioned/scheme" 10 | rest "k8s.io/client-go/rest" 11 | ) 12 | 13 | type FlinkV1beta1Interface interface { 14 | RESTClient() rest.Interface 15 | FlinkApplicationsGetter 16 | } 17 | 18 | // FlinkV1beta1Client is used to interact with features provided by the flink.k8s.io group. 19 | type FlinkV1beta1Client struct { 20 | restClient rest.Interface 21 | } 22 | 23 | func (c *FlinkV1beta1Client) FlinkApplications(namespace string) FlinkApplicationInterface { 24 | return newFlinkApplications(c, namespace) 25 | } 26 | 27 | // NewForConfig creates a new FlinkV1beta1Client for the given config. 28 | // NewForConfig is equivalent to NewForConfigAndClient(c, httpClient), 29 | // where httpClient was generated with rest.HTTPClientFor(c). 30 | func NewForConfig(c *rest.Config) (*FlinkV1beta1Client, error) { 31 | config := *c 32 | if err := setConfigDefaults(&config); err != nil { 33 | return nil, err 34 | } 35 | httpClient, err := rest.HTTPClientFor(&config) 36 | if err != nil { 37 | return nil, err 38 | } 39 | return NewForConfigAndClient(&config, httpClient) 40 | } 41 | 42 | // NewForConfigAndClient creates a new FlinkV1beta1Client for the given config and http client. 43 | // Note the http client provided takes precedence over the configured transport values. 44 | func NewForConfigAndClient(c *rest.Config, h *http.Client) (*FlinkV1beta1Client, error) { 45 | config := *c 46 | if err := setConfigDefaults(&config); err != nil { 47 | return nil, err 48 | } 49 | client, err := rest.RESTClientForConfigAndClient(&config, h) 50 | if err != nil { 51 | return nil, err 52 | } 53 | return &FlinkV1beta1Client{client}, nil 54 | } 55 | 56 | // NewForConfigOrDie creates a new FlinkV1beta1Client for the given config and 57 | // panics if there is an error in the config. 58 | func NewForConfigOrDie(c *rest.Config) *FlinkV1beta1Client { 59 | client, err := NewForConfig(c) 60 | if err != nil { 61 | panic(err) 62 | } 63 | return client 64 | } 65 | 66 | // New creates a new FlinkV1beta1Client for the given RESTClient. 67 | func New(c rest.Interface) *FlinkV1beta1Client { 68 | return &FlinkV1beta1Client{c} 69 | } 70 | 71 | func setConfigDefaults(config *rest.Config) error { 72 | gv := v1beta1.SchemeGroupVersion 73 | config.GroupVersion = &gv 74 | config.APIPath = "/apis" 75 | config.NegotiatedSerializer = scheme.Codecs.WithoutConversion() 76 | 77 | if config.UserAgent == "" { 78 | config.UserAgent = rest.DefaultKubernetesUserAgent() 79 | } 80 | 81 | return nil 82 | } 83 | 84 | // RESTClient returns a RESTClient that is used to communicate 85 | // with API server by this client implementation. 
86 | func (c *FlinkV1beta1Client) RESTClient() rest.Interface { 87 | if c == nil { 88 | return nil 89 | } 90 | return c.restClient 91 | } 92 | -------------------------------------------------------------------------------- /pkg/client/clientset/versioned/typed/app/v1beta1/doc.go: -------------------------------------------------------------------------------- 1 | // Code generated by client-gen. DO NOT EDIT. 2 | 3 | // This package has the automatically generated typed clients. 4 | package v1beta1 5 | -------------------------------------------------------------------------------- /pkg/client/clientset/versioned/typed/app/v1beta1/fake/doc.go: -------------------------------------------------------------------------------- 1 | // Code generated by client-gen. DO NOT EDIT. 2 | 3 | // Package fake has the automatically generated clients. 4 | package fake 5 | -------------------------------------------------------------------------------- /pkg/client/clientset/versioned/typed/app/v1beta1/fake/fake_app_client.go: -------------------------------------------------------------------------------- 1 | // Code generated by client-gen. DO NOT EDIT. 2 | 3 | package fake 4 | 5 | import ( 6 | v1beta1 "github.com/lyft/flinkk8soperator/pkg/client/clientset/versioned/typed/app/v1beta1" 7 | rest "k8s.io/client-go/rest" 8 | testing "k8s.io/client-go/testing" 9 | ) 10 | 11 | type FakeFlinkV1beta1 struct { 12 | *testing.Fake 13 | } 14 | 15 | func (c *FakeFlinkV1beta1) FlinkApplications(namespace string) v1beta1.FlinkApplicationInterface { 16 | return &FakeFlinkApplications{c, namespace} 17 | } 18 | 19 | // RESTClient returns a RESTClient that is used to communicate 20 | // with API server by this client implementation. 21 | func (c *FakeFlinkV1beta1) RESTClient() rest.Interface { 22 | var ret *rest.RESTClient 23 | return ret 24 | } 25 | -------------------------------------------------------------------------------- /pkg/client/clientset/versioned/typed/app/v1beta1/fake/fake_flinkapplication.go: -------------------------------------------------------------------------------- 1 | // Code generated by client-gen. DO NOT EDIT. 2 | 3 | package fake 4 | 5 | import ( 6 | "context" 7 | 8 | v1beta1 "github.com/lyft/flinkk8soperator/pkg/apis/app/v1beta1" 9 | v1 "k8s.io/apimachinery/pkg/apis/meta/v1" 10 | labels "k8s.io/apimachinery/pkg/labels" 11 | schema "k8s.io/apimachinery/pkg/runtime/schema" 12 | types "k8s.io/apimachinery/pkg/types" 13 | watch "k8s.io/apimachinery/pkg/watch" 14 | testing "k8s.io/client-go/testing" 15 | ) 16 | 17 | // FakeFlinkApplications implements FlinkApplicationInterface 18 | type FakeFlinkApplications struct { 19 | Fake *FakeFlinkV1beta1 20 | ns string 21 | } 22 | 23 | var flinkapplicationsResource = schema.GroupVersionResource{Group: "flink.k8s.io", Version: "v1beta1", Resource: "flinkapplications"} 24 | 25 | var flinkapplicationsKind = schema.GroupVersionKind{Group: "flink.k8s.io", Version: "v1beta1", Kind: "FlinkApplication"} 26 | 27 | // Get takes name of the flinkApplication, and returns the corresponding flinkApplication object, and an error if there is any. 28 | func (c *FakeFlinkApplications) Get(ctx context.Context, name string, options v1.GetOptions) (result *v1beta1.FlinkApplication, err error) { 29 | obj, err := c.Fake. 
30 | Invokes(testing.NewGetAction(flinkapplicationsResource, c.ns, name), &v1beta1.FlinkApplication{}) 31 | 32 | if obj == nil { 33 | return nil, err 34 | } 35 | return obj.(*v1beta1.FlinkApplication), err 36 | } 37 | 38 | // List takes label and field selectors, and returns the list of FlinkApplications that match those selectors. 39 | func (c *FakeFlinkApplications) List(ctx context.Context, opts v1.ListOptions) (result *v1beta1.FlinkApplicationList, err error) { 40 | obj, err := c.Fake. 41 | Invokes(testing.NewListAction(flinkapplicationsResource, flinkapplicationsKind, c.ns, opts), &v1beta1.FlinkApplicationList{}) 42 | 43 | if obj == nil { 44 | return nil, err 45 | } 46 | 47 | label, _, _ := testing.ExtractFromListOptions(opts) 48 | if label == nil { 49 | label = labels.Everything() 50 | } 51 | list := &v1beta1.FlinkApplicationList{ListMeta: obj.(*v1beta1.FlinkApplicationList).ListMeta} 52 | for _, item := range obj.(*v1beta1.FlinkApplicationList).Items { 53 | if label.Matches(labels.Set(item.Labels)) { 54 | list.Items = append(list.Items, item) 55 | } 56 | } 57 | return list, err 58 | } 59 | 60 | // Watch returns a watch.Interface that watches the requested flinkApplications. 61 | func (c *FakeFlinkApplications) Watch(ctx context.Context, opts v1.ListOptions) (watch.Interface, error) { 62 | return c.Fake. 63 | InvokesWatch(testing.NewWatchAction(flinkapplicationsResource, c.ns, opts)) 64 | 65 | } 66 | 67 | // Create takes the representation of a flinkApplication and creates it. Returns the server's representation of the flinkApplication, and an error, if there is any. 68 | func (c *FakeFlinkApplications) Create(ctx context.Context, flinkApplication *v1beta1.FlinkApplication, opts v1.CreateOptions) (result *v1beta1.FlinkApplication, err error) { 69 | obj, err := c.Fake. 70 | Invokes(testing.NewCreateAction(flinkapplicationsResource, c.ns, flinkApplication), &v1beta1.FlinkApplication{}) 71 | 72 | if obj == nil { 73 | return nil, err 74 | } 75 | return obj.(*v1beta1.FlinkApplication), err 76 | } 77 | 78 | // Update takes the representation of a flinkApplication and updates it. Returns the server's representation of the flinkApplication, and an error, if there is any. 79 | func (c *FakeFlinkApplications) Update(ctx context.Context, flinkApplication *v1beta1.FlinkApplication, opts v1.UpdateOptions) (result *v1beta1.FlinkApplication, err error) { 80 | obj, err := c.Fake. 81 | Invokes(testing.NewUpdateAction(flinkapplicationsResource, c.ns, flinkApplication), &v1beta1.FlinkApplication{}) 82 | 83 | if obj == nil { 84 | return nil, err 85 | } 86 | return obj.(*v1beta1.FlinkApplication), err 87 | } 88 | 89 | // Delete takes name of the flinkApplication and deletes it. Returns an error if one occurs. 90 | func (c *FakeFlinkApplications) Delete(ctx context.Context, name string, opts v1.DeleteOptions) error { 91 | _, err := c.Fake. 92 | Invokes(testing.NewDeleteActionWithOptions(flinkapplicationsResource, c.ns, name, opts), &v1beta1.FlinkApplication{}) 93 | 94 | return err 95 | } 96 | 97 | // DeleteCollection deletes a collection of objects. 98 | func (c *FakeFlinkApplications) DeleteCollection(ctx context.Context, opts v1.DeleteOptions, listOpts v1.ListOptions) error { 99 | action := testing.NewDeleteCollectionAction(flinkapplicationsResource, c.ns, listOpts) 100 | 101 | _, err := c.Fake.Invokes(action, &v1beta1.FlinkApplicationList{}) 102 | return err 103 | } 104 | 105 | // Patch applies the patch and returns the patched flinkApplication. 
106 | func (c *FakeFlinkApplications) Patch(ctx context.Context, name string, pt types.PatchType, data []byte, opts v1.PatchOptions, subresources ...string) (result *v1beta1.FlinkApplication, err error) { 107 | obj, err := c.Fake. 108 | Invokes(testing.NewPatchSubresourceAction(flinkapplicationsResource, c.ns, name, pt, data, subresources...), &v1beta1.FlinkApplication{}) 109 | 110 | if obj == nil { 111 | return nil, err 112 | } 113 | return obj.(*v1beta1.FlinkApplication), err 114 | } 115 | -------------------------------------------------------------------------------- /pkg/client/clientset/versioned/typed/app/v1beta1/flinkapplication.go: -------------------------------------------------------------------------------- 1 | // Code generated by client-gen. DO NOT EDIT. 2 | 3 | package v1beta1 4 | 5 | import ( 6 | "context" 7 | "time" 8 | 9 | v1beta1 "github.com/lyft/flinkk8soperator/pkg/apis/app/v1beta1" 10 | scheme "github.com/lyft/flinkk8soperator/pkg/client/clientset/versioned/scheme" 11 | v1 "k8s.io/apimachinery/pkg/apis/meta/v1" 12 | types "k8s.io/apimachinery/pkg/types" 13 | watch "k8s.io/apimachinery/pkg/watch" 14 | rest "k8s.io/client-go/rest" 15 | ) 16 | 17 | // FlinkApplicationsGetter has a method to return a FlinkApplicationInterface. 18 | // A group's client should implement this interface. 19 | type FlinkApplicationsGetter interface { 20 | FlinkApplications(namespace string) FlinkApplicationInterface 21 | } 22 | 23 | // FlinkApplicationInterface has methods to work with FlinkApplication resources. 24 | type FlinkApplicationInterface interface { 25 | Create(ctx context.Context, flinkApplication *v1beta1.FlinkApplication, opts v1.CreateOptions) (*v1beta1.FlinkApplication, error) 26 | Update(ctx context.Context, flinkApplication *v1beta1.FlinkApplication, opts v1.UpdateOptions) (*v1beta1.FlinkApplication, error) 27 | Delete(ctx context.Context, name string, opts v1.DeleteOptions) error 28 | DeleteCollection(ctx context.Context, opts v1.DeleteOptions, listOpts v1.ListOptions) error 29 | Get(ctx context.Context, name string, opts v1.GetOptions) (*v1beta1.FlinkApplication, error) 30 | List(ctx context.Context, opts v1.ListOptions) (*v1beta1.FlinkApplicationList, error) 31 | Watch(ctx context.Context, opts v1.ListOptions) (watch.Interface, error) 32 | Patch(ctx context.Context, name string, pt types.PatchType, data []byte, opts v1.PatchOptions, subresources ...string) (result *v1beta1.FlinkApplication, err error) 33 | FlinkApplicationExpansion 34 | } 35 | 36 | // flinkApplications implements FlinkApplicationInterface 37 | type flinkApplications struct { 38 | client rest.Interface 39 | ns string 40 | } 41 | 42 | // newFlinkApplications returns a FlinkApplications 43 | func newFlinkApplications(c *FlinkV1beta1Client, namespace string) *flinkApplications { 44 | return &flinkApplications{ 45 | client: c.RESTClient(), 46 | ns: namespace, 47 | } 48 | } 49 | 50 | // Get takes name of the flinkApplication, and returns the corresponding flinkApplication object, and an error if there is any. 51 | func (c *flinkApplications) Get(ctx context.Context, name string, options v1.GetOptions) (result *v1beta1.FlinkApplication, err error) { 52 | result = &v1beta1.FlinkApplication{} 53 | err = c.client.Get(). 54 | Namespace(c.ns). 55 | Resource("flinkapplications"). 56 | Name(name). 57 | VersionedParams(&options, scheme.ParameterCodec). 58 | Do(ctx). 
59 | Into(result) 60 | return 61 | } 62 | 63 | // List takes label and field selectors, and returns the list of FlinkApplications that match those selectors. 64 | func (c *flinkApplications) List(ctx context.Context, opts v1.ListOptions) (result *v1beta1.FlinkApplicationList, err error) { 65 | var timeout time.Duration 66 | if opts.TimeoutSeconds != nil { 67 | timeout = time.Duration(*opts.TimeoutSeconds) * time.Second 68 | } 69 | result = &v1beta1.FlinkApplicationList{} 70 | err = c.client.Get(). 71 | Namespace(c.ns). 72 | Resource("flinkapplications"). 73 | VersionedParams(&opts, scheme.ParameterCodec). 74 | Timeout(timeout). 75 | Do(ctx). 76 | Into(result) 77 | return 78 | } 79 | 80 | // Watch returns a watch.Interface that watches the requested flinkApplications. 81 | func (c *flinkApplications) Watch(ctx context.Context, opts v1.ListOptions) (watch.Interface, error) { 82 | var timeout time.Duration 83 | if opts.TimeoutSeconds != nil { 84 | timeout = time.Duration(*opts.TimeoutSeconds) * time.Second 85 | } 86 | opts.Watch = true 87 | return c.client.Get(). 88 | Namespace(c.ns). 89 | Resource("flinkapplications"). 90 | VersionedParams(&opts, scheme.ParameterCodec). 91 | Timeout(timeout). 92 | Watch(ctx) 93 | } 94 | 95 | // Create takes the representation of a flinkApplication and creates it. Returns the server's representation of the flinkApplication, and an error, if there is any. 96 | func (c *flinkApplications) Create(ctx context.Context, flinkApplication *v1beta1.FlinkApplication, opts v1.CreateOptions) (result *v1beta1.FlinkApplication, err error) { 97 | result = &v1beta1.FlinkApplication{} 98 | err = c.client.Post(). 99 | Namespace(c.ns). 100 | Resource("flinkapplications"). 101 | VersionedParams(&opts, scheme.ParameterCodec). 102 | Body(flinkApplication). 103 | Do(ctx). 104 | Into(result) 105 | return 106 | } 107 | 108 | // Update takes the representation of a flinkApplication and updates it. Returns the server's representation of the flinkApplication, and an error, if there is any. 109 | func (c *flinkApplications) Update(ctx context.Context, flinkApplication *v1beta1.FlinkApplication, opts v1.UpdateOptions) (result *v1beta1.FlinkApplication, err error) { 110 | result = &v1beta1.FlinkApplication{} 111 | err = c.client.Put(). 112 | Namespace(c.ns). 113 | Resource("flinkapplications"). 114 | Name(flinkApplication.Name). 115 | VersionedParams(&opts, scheme.ParameterCodec). 116 | Body(flinkApplication). 117 | Do(ctx). 118 | Into(result) 119 | return 120 | } 121 | 122 | // Delete takes name of the flinkApplication and deletes it. Returns an error if one occurs. 123 | func (c *flinkApplications) Delete(ctx context.Context, name string, opts v1.DeleteOptions) error { 124 | return c.client.Delete(). 125 | Namespace(c.ns). 126 | Resource("flinkapplications"). 127 | Name(name). 128 | Body(&opts). 129 | Do(ctx). 130 | Error() 131 | } 132 | 133 | // DeleteCollection deletes a collection of objects. 134 | func (c *flinkApplications) DeleteCollection(ctx context.Context, opts v1.DeleteOptions, listOpts v1.ListOptions) error { 135 | var timeout time.Duration 136 | if listOpts.TimeoutSeconds != nil { 137 | timeout = time.Duration(*listOpts.TimeoutSeconds) * time.Second 138 | } 139 | return c.client.Delete(). 140 | Namespace(c.ns). 141 | Resource("flinkapplications"). 142 | VersionedParams(&listOpts, scheme.ParameterCodec). 143 | Timeout(timeout). 144 | Body(&opts). 145 | Do(ctx). 146 | Error() 147 | } 148 | 149 | // Patch applies the patch and returns the patched flinkApplication. 
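For context on how this typed client is obtained in practice: callers build the versioned clientset from a rest.Config and then go through the namespaced getter shown above. A minimal sketch, assuming the standard client-gen constructor versioned.NewForConfig and the FlinkV1beta1 group accessor generated elsewhere in this package tree:

package example

import (
	"context"

	versioned "github.com/lyft/flinkk8soperator/pkg/client/clientset/versioned"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/rest"
)

func listApplications(cfg *rest.Config) error {
	cs, err := versioned.NewForConfig(cfg)
	if err != nil {
		return err
	}
	// List every FlinkApplication in the "flink" namespace through the generated typed client.
	apps, err := cs.FlinkV1beta1().FlinkApplications("flink").List(context.Background(), metav1.ListOptions{})
	if err != nil {
		return err
	}
	for _, app := range apps.Items {
		_ = app.Name // e.g. hand off to reconciliation or reporting
	}
	return nil
}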
150 | func (c *flinkApplications) Patch(ctx context.Context, name string, pt types.PatchType, data []byte, opts v1.PatchOptions, subresources ...string) (result *v1beta1.FlinkApplication, err error) { 151 | result = &v1beta1.FlinkApplication{} 152 | err = c.client.Patch(pt). 153 | Namespace(c.ns). 154 | Resource("flinkapplications"). 155 | Name(name). 156 | SubResource(subresources...). 157 | VersionedParams(&opts, scheme.ParameterCodec). 158 | Body(data). 159 | Do(ctx). 160 | Into(result) 161 | return 162 | } 163 | -------------------------------------------------------------------------------- /pkg/client/clientset/versioned/typed/app/v1beta1/generated_expansion.go: -------------------------------------------------------------------------------- 1 | // Code generated by client-gen. DO NOT EDIT. 2 | 3 | package v1beta1 4 | 5 | type FlinkApplicationExpansion interface{} 6 | -------------------------------------------------------------------------------- /pkg/controller/add_flinkapplication.go: -------------------------------------------------------------------------------- 1 | package controller 2 | 3 | import ( 4 | "github.com/lyft/flinkk8soperator/pkg/controller/flinkapplication" 5 | ) 6 | 7 | func init() { 8 | // AddToManagerFuncs is a list of functions to create controllers and add them to a manager. 9 | AddToManagerFuncs = append(AddToManagerFuncs, flinkapplication.Add) 10 | } 11 | -------------------------------------------------------------------------------- /pkg/controller/common/utils.go: -------------------------------------------------------------------------------- 1 | package common 2 | 3 | import ( 4 | "github.com/lyft/flytestdlib/contextutils" 5 | 6 | appsv1 "k8s.io/api/apps/v1" 7 | v1 "k8s.io/api/core/v1" 8 | ) 9 | 10 | func GetValidLabelNames() []contextutils.Key { 11 | return []contextutils.Key{ 12 | contextutils.AppNameKey, 13 | contextutils.NamespaceKey, 14 | } 15 | } 16 | 17 | func DuplicateMap(o map[string]string) (r map[string]string) { 18 | if o == nil { 19 | return map[string]string{} 20 | } 21 | r = make(map[string]string, len(o)) 22 | for k, v := range o { 23 | r[k] = v 24 | } 25 | return 26 | } 27 | 28 | func CopyMap(to map[string]string, from map[string]string) map[string]string { 29 | if len(to) == 0 && len(from) == 0 { 30 | return to 31 | } 32 | if len(from) == 0 { 33 | return to 34 | } 35 | if len(to) == 0 { 36 | to = make(map[string]string, len(from)) 37 | } 38 | for k, v := range from { 39 | to[k] = v 40 | } 41 | return to 42 | } 43 | 44 | func GetEnvVar(envs []v1.EnvVar, name string) *v1.EnvVar { 45 | for _, v := range envs { 46 | if v.Name == name { 47 | return &v 48 | } 49 | } 50 | 51 | return nil 52 | } 53 | 54 | type FlinkDeployment struct { 55 | Jobmanager *appsv1.Deployment 56 | Taskmanager *appsv1.Deployment 57 | Hash string 58 | } 59 | -------------------------------------------------------------------------------- /pkg/controller/config/config.go: -------------------------------------------------------------------------------- 1 | package config 2 | 3 | import ( 4 | "github.com/lyft/flytestdlib/config" 5 | ) 6 | 7 | //go:generate pflags Config 8 | 9 | const AppName = "flinkK8sOperator" 10 | const configSectionKey = "operator" 11 | 12 | var ConfigSection = config.MustRegisterSection(configSectionKey, &Config{}) 13 | 14 | type Config struct { 15 | ResyncPeriod config.Duration `json:"resyncPeriod" pflag:"\"30s\",Determines the resync period for all watchers."` 16 | LimitNamespace string `json:"limitNamespace" pflag:"\"\",Namespaces to watch for by 
flink operator"` 17 | MetricsPrefix string `json:"metricsPrefix" pflag:"\"flinkk8soperator\",Prefix for metrics propagated to prometheus"` 18 | ProfilerPort config.Port `json:"profilerPort" pflag:"\"10254\",Profiler port"` 19 | FlinkIngressURLFormat string `json:"ingressUrlFormat"` 20 | UseProxy bool `json:"useKubectlProxy"` 21 | ProxyPort config.Port `json:"proxyPort" pflag:"\"8001\",The port at which flink cluster runs locally"` 22 | ContainerNameFormat string `json:"containerNameFormat"` 23 | Workers int `json:"workers" pflag:"4,Number of routines to process custom resource"` 24 | BaseBackoffDuration config.Duration `json:"baseBackoffDuration" pflag:"\"100ms\",Determines the base backoff for exponential retries."` 25 | MaxBackoffDuration config.Duration `json:"maxBackoffDuration" pflag:"\"30s\",Determines the max backoff for exponential retries."` 26 | MaxErrDuration config.Duration `json:"maxErrDuration" pflag:"\"5m\",Determines the max time to wait on errors."` 27 | FlinkJobVertexTimeout config.Duration `json:"flinkJobVertexTimeout" pflag:"\"3m\",Determines the max time to wait on job vertex state turns into RUNNING."` 28 | SchedulerName string `json:"schedulerName"` 29 | } 30 | 31 | func GetConfig() *Config { 32 | return ConfigSection.GetConfig().(*Config) 33 | } 34 | 35 | func SetConfig(c *Config) error { 36 | return ConfigSection.SetConfig(c) 37 | } 38 | -------------------------------------------------------------------------------- /pkg/controller/config/config_flags.go: -------------------------------------------------------------------------------- 1 | // Code generated by go generate; DO NOT EDIT. 2 | // This file was generated by robots. 3 | 4 | package config 5 | 6 | import ( 7 | "encoding/json" 8 | "reflect" 9 | 10 | "fmt" 11 | 12 | "github.com/spf13/pflag" 13 | ) 14 | 15 | // If v is a pointer, it will get its element value or the zero value of the element type. 16 | // If v is not a pointer, it will return it as is. 17 | func (Config) elemValueOrNil(v interface{}) interface{} { 18 | if t := reflect.TypeOf(v); t.Kind() == reflect.Ptr { 19 | if reflect.ValueOf(v).IsNil() { 20 | return reflect.Zero(t.Elem()).Interface() 21 | } else { 22 | return reflect.ValueOf(v).Interface() 23 | } 24 | } else if v == nil { 25 | return reflect.Zero(t).Interface() 26 | } 27 | 28 | return v 29 | } 30 | 31 | func (Config) mustMarshalJSON(v json.Marshaler) string { 32 | raw, err := v.MarshalJSON() 33 | if err != nil { 34 | panic(err) 35 | } 36 | 37 | return string(raw) 38 | } 39 | 40 | // GetPFlagSet will return strongly types pflags for all fields in Config and its nested types. The format of the 41 | // flags is json-name.json-sub-name... etc. 
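To make the json-name-to-flag mapping concrete, here is a small sketch of inspecting the flag set produced by GetPFlagSet below; the "operator." prefix is an assumption based on the section key registered in config.go, the function itself accepts any prefix:

package example

import (
	"fmt"

	"github.com/lyft/flinkk8soperator/pkg/controller/config"
)

func printFlagDefaults() {
	cfg := config.Config{}
	fs := cfg.GetPFlagSet("operator.")
	// Each Config field becomes <prefix><json name>, with defaults taken from the pflag struct tags.
	fmt.Println(fs.Lookup("operator.resyncPeriod").DefValue) // "30s"
	fmt.Println(fs.Lookup("operator.workers").DefValue)      // "4"
}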
42 | func (cfg Config) GetPFlagSet(prefix string) *pflag.FlagSet { 43 | cmdFlags := pflag.NewFlagSet("Config", pflag.ExitOnError) 44 | cmdFlags.String(fmt.Sprintf("%v%v", prefix, "resyncPeriod"), "30s", "Determines the resync period for all watchers.") 45 | cmdFlags.String(fmt.Sprintf("%v%v", prefix, "limitNamespace"), "", "Namespaces to watch for by flink operator") 46 | cmdFlags.String(fmt.Sprintf("%v%v", prefix, "metricsPrefix"), "flinkk8soperator", "Prefix for metrics propagated to prometheus") 47 | cmdFlags.String(fmt.Sprintf("%v%v", prefix, "profilerPort"), "10254", "Profiler port") 48 | cmdFlags.String(fmt.Sprintf("%v%v", prefix, "ingressUrlFormat"), *new(string), "") 49 | cmdFlags.Bool(fmt.Sprintf("%v%v", prefix, "useKubectlProxy"), *new(bool), "") 50 | cmdFlags.String(fmt.Sprintf("%v%v", prefix, "proxyPort"), "8001", "The port at which flink cluster runs locally") 51 | cmdFlags.String(fmt.Sprintf("%v%v", prefix, "containerNameFormat"), *new(string), "") 52 | cmdFlags.Int(fmt.Sprintf("%v%v", prefix, "workers"), 4, "Number of routines to process custom resource") 53 | cmdFlags.String(fmt.Sprintf("%v%v", prefix, "baseBackoffDuration"), "100ms", "Determines the base backoff for exponential retries.") 54 | cmdFlags.String(fmt.Sprintf("%v%v", prefix, "maxBackoffDuration"), "30s", "Determines the max backoff for exponential retries.") 55 | cmdFlags.String(fmt.Sprintf("%v%v", prefix, "maxErrDuration"), "5m", "Determines the max time to wait on errors.") 56 | cmdFlags.String(fmt.Sprintf("%v%v", prefix, "flinkJobVertexTimeout"), "3m", "Determines the max time to wait on job vertex state turns into RUNNING.") 57 | cmdFlags.String(fmt.Sprintf("%v%v", prefix, "schedulerName"), *new(string), "") 58 | return cmdFlags 59 | } 60 | -------------------------------------------------------------------------------- /pkg/controller/config/runtime_config.go: -------------------------------------------------------------------------------- 1 | package config 2 | 3 | import ( 4 | "github.com/lyft/flytestdlib/promutils" 5 | ) 6 | 7 | type RuntimeConfig struct { 8 | MetricsScope promutils.Scope 9 | } 10 | -------------------------------------------------------------------------------- /pkg/controller/controller.go: -------------------------------------------------------------------------------- 1 | package controller 2 | 3 | import ( 4 | "context" 5 | 6 | "github.com/lyft/flinkk8soperator/pkg/controller/config" 7 | "sigs.k8s.io/controller-runtime/pkg/manager" 8 | ) 9 | 10 | // AddToManagerFuncs is a list of functions to add all Controllers to the Manager 11 | var AddToManagerFuncs []func(context.Context, manager.Manager, config.RuntimeConfig) error 12 | 13 | // AddToManager adds all Controllers to the Manager 14 | func AddToManager(ctx context.Context, m manager.Manager, runtimeCfg config.RuntimeConfig) error { 15 | for _, f := range AddToManagerFuncs { 16 | if err := f(ctx, m, runtimeCfg); err != nil { 17 | return err 18 | } 19 | } 20 | return nil 21 | } 22 | -------------------------------------------------------------------------------- /pkg/controller/errors/codes.go: -------------------------------------------------------------------------------- 1 | package errors 2 | 3 | type ErrorCode = string 4 | 5 | const ( 6 | IllegalStateError ErrorCode = "IllegalStateError" 7 | CausedByError ErrorCode = "CausedByError" 8 | BadJobSpecificationError ErrorCode = "BadJobSpecificationError" 9 | ReconciliationNeeded ErrorCode = "ReconciliationNeeded" 10 | ) 11 | 
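These codes are consumed through the constructors and predicates defined in error.go, the next file; a hypothetical caller-side sketch:

package example

import (
	"github.com/lyft/flinkk8soperator/pkg/controller/errors"
)

// specChanged builds an operator error that signals the application should simply be requeued.
func specChanged(name string) error {
	return errors.Errorf(errors.ReconciliationNeeded, "spec of %s changed during reconcile", name)
}

// wrapUpdateFailure attaches an operator error code to a lower-level failure for reporting.
func wrapUpdateFailure(cause error) error {
	return errors.WrapErrorf(errors.CausedByError, cause, "failed to update cluster state")
}

// shouldRequeue distinguishes a requeue signal from a hard failure.
func shouldRequeue(err error) bool {
	return errors.IsReconciliationNeeded(err)
}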
-------------------------------------------------------------------------------- /pkg/controller/errors/error.go: -------------------------------------------------------------------------------- 1 | package errors 2 | 3 | import ( 4 | "fmt" 5 | ) 6 | 7 | type ErrorMessage = string 8 | 9 | type FlinkOperatorError struct { 10 | Code ErrorCode 11 | Message ErrorMessage 12 | } 13 | 14 | func (w *FlinkOperatorError) Error() string { 15 | return fmt.Sprintf("ErrorCode: [%v] Reason: [%v]", w.Code, w.Message) 16 | } 17 | 18 | type FlinkOperatorErrorWithCause struct { 19 | *FlinkOperatorError 20 | cause error 21 | } 22 | 23 | func (w *FlinkOperatorErrorWithCause) Error() string { 24 | return fmt.Sprintf("%v. Caused By [%v]", w.FlinkOperatorError.Error(), w.cause) 25 | } 26 | 27 | func (w *FlinkOperatorErrorWithCause) Cause() error { 28 | return w.cause 29 | } 30 | 31 | func errorf(c ErrorCode, msgFmt string, args ...interface{}) *FlinkOperatorError { 32 | return &FlinkOperatorError{ 33 | Code: c, 34 | Message: fmt.Sprintf(msgFmt, args...), 35 | } 36 | } 37 | 38 | func Errorf(c ErrorCode, msgFmt string, args ...interface{}) error { 39 | return errorf(c, msgFmt, args...) 40 | } 41 | 42 | func WrapErrorf(c ErrorCode, cause error, msgFmt string, args ...interface{}) error { 43 | return &FlinkOperatorErrorWithCause{ 44 | FlinkOperatorError: errorf(c, msgFmt, args...), 45 | cause: cause, 46 | } 47 | } 48 | 49 | func IsReconciliationNeeded(err error) bool { 50 | if fErr, ok := err.(*FlinkOperatorError); ok { 51 | if fErr.Code == ReconciliationNeeded { 52 | return true 53 | } 54 | } 55 | return false 56 | } 57 | -------------------------------------------------------------------------------- /pkg/controller/flink/client/entities.go: -------------------------------------------------------------------------------- 1 | package client 2 | 3 | type SavepointStatus string 4 | 5 | const ( 6 | SavePointInvalid SavepointStatus = "" 7 | SavePointInProgress SavepointStatus = "IN_PROGRESS" 8 | SavePointCompleted SavepointStatus = "COMPLETED" 9 | ) 10 | 11 | type CheckpointStatus string 12 | 13 | const ( 14 | CheckpointInProgress CheckpointStatus = "IN_PROGRESS" 15 | CheckpointFailed CheckpointStatus = "FAILED" 16 | CheckpointCompleted CheckpointStatus = "COMPLETED" 17 | ) 18 | 19 | type JobState string 20 | 21 | const ( 22 | Created JobState = "CREATED" 23 | Running JobState = "RUNNING" 24 | Failing JobState = "FAILING" 25 | Failed JobState = "FAILED" 26 | Cancelling JobState = "CANCELLING" 27 | Canceled JobState = "CANCELED" 28 | Finished JobState = "FINISHED" 29 | Restarting JobState = "RESTARTING" 30 | Suspended JobState = "SUSPENDED" 31 | Reconciling JobState = "RECONCILING" 32 | ) 33 | 34 | type SavepointJobRequest struct { 35 | CancelJob bool `json:"cancel-job"` 36 | TargetDirectory string `json:"target-directory,omitempty"` 37 | } 38 | 39 | type SubmitJobRequest struct { 40 | SavepointPath string `json:"savepointPath,omitempty"` 41 | Parallelism int32 `json:"parallelism,omitempty"` 42 | ProgramArgs string `json:"programArgs,omitempty"` 43 | EntryClass string `json:"entryClass,omitempty"` 44 | AllowNonRestoredState bool `json:"allowNonRestoredState,omitempty"` 45 | } 46 | 47 | type SavepointResponse struct { 48 | SavepointStatus SavepointStatusResponse `json:"status"` 49 | Operation SavepointOperationResponse `json:"operation"` 50 | } 51 | 52 | type SavepointStatusResponse struct { 53 | Status SavepointStatus `json:"id"` 54 | } 55 | 56 | type SavepointOperationResponse struct { 57 | Location string 
`json:"location"` 58 | FailureCause FailureCause `json:"failure-cause"` 59 | } 60 | 61 | type FailureCause struct { 62 | Class string `json:"class"` 63 | StackTrace string `json:"stack-trace"` 64 | } 65 | 66 | type SavepointJobResponse struct { 67 | TriggerID string `json:"request-id"` 68 | } 69 | 70 | type SubmitJobResponse struct { 71 | JobID string `json:"jobid"` 72 | } 73 | 74 | type GetJobsResponse struct { 75 | Jobs []FlinkJob `json:"jobs"` 76 | } 77 | 78 | type JobConfigResponse struct { 79 | JobID string `json:"jid"` 80 | ExecutionConfig JobExecutionConfig `json:"execution-config"` 81 | } 82 | 83 | type JobExecutionConfig struct { 84 | Parallelism int32 `json:"job-parallelism"` 85 | } 86 | 87 | type FlinkJob struct { 88 | JobID string `json:"id"` 89 | Status JobState `json:"status"` 90 | } 91 | 92 | type FlinkJobVertex struct { 93 | ID string `json:"id"` 94 | Name string `json:"name"` 95 | Parallelism int64 `json:"parallelism"` 96 | Status JobState `json:"status"` 97 | StartTime int64 `json:"start-time"` 98 | EndTime int64 `json:"end-time"` 99 | Duration int64 `json:"duration"` 100 | Tasks map[string]int64 `json:"tasks"` 101 | Metrics map[string]interface{} `json:"metrics"` 102 | } 103 | 104 | type FlinkJobOverview struct { 105 | JobID string `json:"jid"` 106 | State JobState `json:"state"` 107 | StartTime int64 `json:"start-time"` 108 | EndTime int64 `json:"end-time"` 109 | Vertices []FlinkJobVertex `json:"vertices"` 110 | } 111 | 112 | type ClusterOverviewResponse struct { 113 | TaskManagerCount int32 `json:"taskmanagers"` 114 | SlotsAvailable int32 `json:"slots-available"` 115 | NumberOfTaskSlots int32 `json:"slots-total"` 116 | } 117 | 118 | type CheckpointStatistics struct { 119 | ID uint `json:"id"` 120 | Status CheckpointStatus `json:"status"` 121 | IsSavepoint bool `json:"is_savepoint"` 122 | TriggerTimestamp int64 `json:"trigger_timestamp"` 123 | LatestAckTimestamp int64 `json:"latest_ack_timestamp"` 124 | StateSize int64 `json:"state_size"` 125 | EndToEndDuration int64 `json:"end_to_end_duration"` 126 | AlignmentBuffered int64 `json:"alignment_buffered"` 127 | NumSubtasks int64 `json:"num_subtasks"` 128 | FailureTimestamp int64 `json:"failure_timestamp"` 129 | FailureMessage string `json:"failure_message"` 130 | ExternalPath string `json:"external_path"` 131 | Discarded bool `json:"discarded"` 132 | RestoredTimeStamp int64 `json:"restore_timestamp"` 133 | } 134 | 135 | type LatestCheckpoints struct { 136 | Completed *CheckpointStatistics `json:"completed,omitempty"` 137 | Savepoint *CheckpointStatistics `json:"savepoint,omitempty"` 138 | Failed *CheckpointStatistics `json:"failed,omitempty"` 139 | Restored *CheckpointStatistics `json:"restored,omitempty"` 140 | } 141 | 142 | type CheckpointResponse struct { 143 | Counts map[string]int32 `json:"counts"` 144 | Latest LatestCheckpoints `json:"latest"` 145 | History []CheckpointStatistics `json:"history"` 146 | } 147 | 148 | type TaskManagerStats struct { 149 | Path string `json:"path"` 150 | DataPort int32 `json:"dataPort"` 151 | TimeSinceLastHeartbeat int64 `json:"timeSinceLastHeartbeat"` 152 | SlotsNumber int32 `json:"slotsNumber"` 153 | FreeSlots int32 `json:"freeSlots"` 154 | } 155 | 156 | type TaskManagersResponse struct { 157 | TaskManagers []TaskManagerStats `json:"taskmanagers"` 158 | } 159 | -------------------------------------------------------------------------------- /pkg/controller/flink/client/error_handler.go: -------------------------------------------------------------------------------- 1 | package client 
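Before the implementation, a quick worked example of the exponential backoff computed by RetryHandler.GetRetryDelay further down in this file, using the operator's default settings (100ms base backoff, 5m max error duration, 30s max backoff) purely as an illustration:

package example

import (
	"fmt"
	"time"

	"github.com/lyft/flinkk8soperator/pkg/controller/flink/client"
)

func backoffExample() {
	r := client.NewRetryHandler(100*time.Millisecond, 5*time.Minute, 30*time.Second)
	// Retry 5 waits 2^5 * rand(100..199ms), i.e. roughly 3.2s-6.4s; later retries are capped at 30s.
	fmt.Println(r.GetRetryDelay(5))
}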
2 | 3 | import ( 4 | "fmt" 5 | "math/rand" 6 | "time" 7 | 8 | v1 "k8s.io/apimachinery/pkg/apis/meta/v1" 9 | 10 | "github.com/lyft/flinkk8soperator/pkg/apis/app/v1beta1" 11 | 12 | "github.com/pkg/errors" 13 | "k8s.io/utils/clock" 14 | ) 15 | 16 | // appError codes 17 | const ( 18 | GlobalFailure = "FAILED" 19 | JSONUnmarshalError = "JSONUNMARSHALERROR" 20 | DefaultRetries = 20 21 | NoRetries = 0 22 | ) 23 | 24 | func GetRetryableError(err error, method v1beta1.FlinkMethod, errorCode string, maxRetries int32) error { 25 | return GetRetryableErrorWithMessage(err, method, errorCode, maxRetries, "") 26 | } 27 | 28 | func GetRetryableErrorWithMessage(err error, method v1beta1.FlinkMethod, errorCode string, maxRetries int32, message string) error { 29 | appError := getErrorValue(err, method, errorCode, message) 30 | return NewFlinkApplicationError(appError.Error(), method, errorCode, true, false, maxRetries) 31 | } 32 | 33 | func GetNonRetryableError(err error, method v1beta1.FlinkMethod, errorCode string) error { 34 | return GetNonRetryableErrorWithMessage(err, method, errorCode, "") 35 | } 36 | 37 | func GetNonRetryableErrorWithMessage(err error, method v1beta1.FlinkMethod, errorCode string, message string) error { 38 | appError := getErrorValue(err, method, errorCode, message) 39 | return NewFlinkApplicationError(appError.Error(), method, errorCode, false, true, NoRetries) 40 | } 41 | 42 | func getErrorValue(err error, method v1beta1.FlinkMethod, errorCode string, message string) error { 43 | if err == nil { 44 | return errors.New(fmt.Sprintf("%v call failed with status %v and message '%s'", method, errorCode, message)) 45 | } 46 | return errors.Wrapf(err, "%v call failed with status %v and message '%s'", method, errorCode, message) 47 | } 48 | 49 | func min(a, b int) int { 50 | if a < b { 51 | return a 52 | } 53 | return b 54 | } 55 | 56 | type RetryHandlerInterface interface { 57 | IsErrorRetryable(err error) bool 58 | IsRetryRemaining(err error, retryCount int32) bool 59 | WaitOnError(clock clock.Clock, lastUpdatedTime time.Time) (time.Duration, bool) 60 | GetRetryDelay(retryCount int32) time.Duration 61 | IsTimeToRetry(clock clock.Clock, lastUpdatedTime time.Time, retryCount int32) bool 62 | } 63 | 64 | // A Retryer that has methods to determine if an error is retryable and also does exponential backoff 65 | type RetryHandler struct { 66 | baseBackOffDuration time.Duration 67 | maxErrDuration time.Duration 68 | maxBackOffMillisDuration time.Duration 69 | } 70 | 71 | func NewRetryHandler(baseBackoff time.Duration, timeToWait time.Duration, maxBackOff time.Duration) RetryHandler { 72 | return RetryHandler{baseBackoff, timeToWait, maxBackOff} 73 | } 74 | func (r RetryHandler) IsErrorRetryable(err error) bool { 75 | if err == nil { 76 | return false 77 | } 78 | flinkAppError, ok := err.(*v1beta1.FlinkApplicationError) 79 | if ok && flinkAppError != nil { 80 | return flinkAppError.IsRetryable 81 | } 82 | 83 | return false 84 | } 85 | 86 | func (r RetryHandler) IsRetryRemaining(err error, retryCount int32) bool { 87 | flinkAppError, ok := err.(*v1beta1.FlinkApplicationError) 88 | if ok && flinkAppError != nil { 89 | return retryCount <= flinkAppError.MaxRetries 90 | } 91 | 92 | return false 93 | } 94 | 95 | func (r RetryHandler) WaitOnError(clock clock.Clock, lastUpdatedTime time.Time) (time.Duration, bool) { 96 | elapsedTime := clock.Since(lastUpdatedTime) 97 | return elapsedTime, elapsedTime <= r.maxErrDuration 98 | 99 | } 100 | func (r RetryHandler) GetRetryDelay(retryCount int32) 
time.Duration { 101 | timeInMillis := int(r.baseBackOffDuration.Nanoseconds() / int64(time.Millisecond)) 102 | if timeInMillis <= 0 { 103 | timeInMillis = 1 104 | } 105 | maxBackoffMillis := int(r.maxBackOffMillisDuration.Nanoseconds() / int64(time.Millisecond)) 106 | delay := 1 << uint(retryCount) * (rand.Intn(timeInMillis) + timeInMillis) // nolint: gosec 107 | return time.Duration(min(delay, maxBackoffMillis)) * time.Millisecond 108 | } 109 | func (r RetryHandler) IsTimeToRetry(clock clock.Clock, lastUpdatedTime time.Time, retryCount int32) bool { 110 | elapsedTime := clock.Since(lastUpdatedTime) 111 | return elapsedTime >= r.GetRetryDelay(retryCount) 112 | } 113 | 114 | func NewFlinkApplicationError(appError string, method v1beta1.FlinkMethod, errorCode string, isRetryable bool, isFailFast bool, maxRetries int32) *v1beta1.FlinkApplicationError { 115 | now := v1.Now() 116 | return &v1beta1.FlinkApplicationError{AppError: appError, Method: method, ErrorCode: errorCode, IsRetryable: isRetryable, IsFailFast: isFailFast, MaxRetries: maxRetries, LastErrorUpdateTime: &now} 117 | } 118 | -------------------------------------------------------------------------------- /pkg/controller/flink/client/error_handler_test.go: -------------------------------------------------------------------------------- 1 | package client 2 | 3 | import ( 4 | "testing" 5 | "time" 6 | 7 | clockTesting "k8s.io/utils/clock/testing" 8 | 9 | "github.com/pkg/errors" 10 | "github.com/stretchr/testify/assert" 11 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 12 | ) 13 | 14 | func getTestRetryer() RetryHandler { 15 | return NewRetryHandler(10*time.Millisecond, 10*time.Millisecond, 50*time.Millisecond) 16 | } 17 | 18 | func TestGetError(t *testing.T) { 19 | testErr := errors.New("Service unavailable") 20 | ferr := GetNonRetryableError(testErr, "GetTest", "500") 21 | assert.Equal(t, "GetTest call failed with status 500 and message '': Service unavailable", ferr.Error()) 22 | 23 | //nil error 24 | ferrNil := GetNonRetryableError(nil, "GetTest", "500") 25 | assert.Equal(t, "GetTest call failed with status 500 and message ''", ferrNil.Error()) 26 | 27 | testWrappedErr := errors.Wrap(testErr, "Wrapped errors") 28 | ferrWrapped := GetNonRetryableError(testWrappedErr, "GetTestWrapped", "400") 29 | assert.Equal(t, "GetTestWrapped call failed with status 400 and message '': Wrapped errors: Service unavailable", ferrWrapped.Error()) 30 | 31 | testMessageErr := errors.New("Test Error") 32 | ferrMessage := GetNonRetryableErrorWithMessage(testMessageErr, "GetTest", "500", "message") 33 | assert.Equal(t, "GetTest call failed with status 500 and message 'message': Test Error", ferrMessage.Error()) 34 | } 35 | 36 | func TestErrors(t *testing.T) { 37 | retryableError := errors.New("GetClusterOverview500") 38 | ferr := GetRetryableError(retryableError, "GetTest", "500", DefaultRetries) 39 | retryer := getTestRetryer() 40 | assert.True(t, retryer.IsErrorRetryable(ferr)) 41 | 42 | failFastError := errors.New("SubmitJob400BadRequest") 43 | ferr = GetNonRetryableError(failFastError, "GetTest", "500") 44 | assert.False(t, retryer.IsErrorRetryable(ferr)) 45 | } 46 | 47 | func TestRetryHandler_GetRetryDelay(t *testing.T) { 48 | retryHandler := getTestRetryer() 49 | assert.True(t, retryHandler.GetRetryDelay(20) <= 50*time.Millisecond) 50 | assert.True(t, retryHandler.GetRetryDelay(1) <= 50*time.Millisecond) 51 | assert.True(t, retryHandler.GetRetryDelay(200) <= 50*time.Millisecond) 52 | } 53 | 54 | func TestRetryHandler_IsRetryRemaining(t *testing.T) 
{ 55 | retryableError := errors.New("GetClusterOverview500") 56 | ferr := GetRetryableError(retryableError, "GetTest", "500", DefaultRetries) 57 | retryer := getTestRetryer() 58 | assert.True(t, retryer.IsRetryRemaining(ferr, 2)) 59 | assert.False(t, retryer.IsRetryRemaining(ferr, 22)) 60 | } 61 | 62 | func TestRetryHandler_IsTimeToRetry(t *testing.T) { 63 | retryer := getTestRetryer() 64 | currTime := metav1.NewTime(time.Now()) 65 | olderTime := currTime.Add(-5 * time.Second) 66 | fakeClock := clockTesting.NewFakeClock(currTime.Time) 67 | fakeClock.SetTime(time.Now()) 68 | // Set retry count to 0 to keep retry delay small 69 | assert.True(t, retryer.IsTimeToRetry(fakeClock, olderTime, 0)) 70 | } 71 | -------------------------------------------------------------------------------- /pkg/controller/flink/client/mock/mock_api.go: -------------------------------------------------------------------------------- 1 | package mock 2 | 3 | import ( 4 | "context" 5 | 6 | "github.com/lyft/flinkk8soperator/pkg/controller/flink/client" 7 | ) 8 | 9 | type CancelJobWithSavepointFunc func(ctx context.Context, url string, jobID string) (string, error) 10 | type ForceCancelJobFunc func(ctx context.Context, url string, jobID string) error 11 | type SubmitJobFunc func(ctx context.Context, url string, jarID string, submitJobRequest client.SubmitJobRequest) (*client.SubmitJobResponse, error) 12 | type CheckSavepointStatusFunc func(ctx context.Context, url string, jobID, triggerID string) (*client.SavepointResponse, error) 13 | type GetJobsFunc func(ctx context.Context, url string) (*client.GetJobsResponse, error) 14 | type GetClusterOverviewFunc func(ctx context.Context, url string) (*client.ClusterOverviewResponse, error) 15 | type GetLatestCheckpointFunc func(ctx context.Context, url string, jobID string) (*client.CheckpointStatistics, error) 16 | type GetJobConfigFunc func(ctx context.Context, url string, jobID string) (*client.JobConfigResponse, error) 17 | type GetTaskManagersFunc func(ctx context.Context, url string) (*client.TaskManagersResponse, error) 18 | type GetCheckpointCountsFunc func(ctx context.Context, url string, jobID string) (*client.CheckpointResponse, error) 19 | type GetJobOverviewFunc func(ctx context.Context, url string, jobID string) (*client.FlinkJobOverview, error) 20 | type SavepointJobFunc func(ctx context.Context, url string, jobID string) (string, error) 21 | type JobManagerClient struct { 22 | CancelJobWithSavepointFunc CancelJobWithSavepointFunc 23 | ForceCancelJobFunc ForceCancelJobFunc 24 | SubmitJobFunc SubmitJobFunc 25 | CheckSavepointStatusFunc CheckSavepointStatusFunc 26 | GetJobsFunc GetJobsFunc 27 | GetClusterOverviewFunc GetClusterOverviewFunc 28 | GetJobConfigFunc GetJobConfigFunc 29 | GetLatestCheckpointFunc GetLatestCheckpointFunc 30 | GetTaskManagersFunc GetTaskManagersFunc 31 | GetCheckpointCountsFunc GetCheckpointCountsFunc 32 | GetJobOverviewFunc GetJobOverviewFunc 33 | SavepointJobFunc SavepointJobFunc 34 | } 35 | 36 | func (m *JobManagerClient) SubmitJob(ctx context.Context, url string, jarID string, submitJobRequest client.SubmitJobRequest) (*client.SubmitJobResponse, error) { 37 | if m.SubmitJobFunc != nil { 38 | return m.SubmitJobFunc(ctx, url, jarID, submitJobRequest) 39 | } 40 | return nil, nil 41 | } 42 | 43 | func (m *JobManagerClient) CancelJobWithSavepoint(ctx context.Context, url string, jobID string) (string, error) { 44 | if m.CancelJobWithSavepointFunc != nil { 45 | return m.CancelJobWithSavepointFunc(ctx, url, jobID) 46 | } 47 | return "", nil 48 
| } 49 | 50 | func (m *JobManagerClient) ForceCancelJob(ctx context.Context, url string, jobID string) error { 51 | if m.ForceCancelJobFunc != nil { 52 | return m.ForceCancelJobFunc(ctx, url, jobID) 53 | } 54 | return nil 55 | } 56 | 57 | func (m *JobManagerClient) CheckSavepointStatus(ctx context.Context, url string, jobID, triggerID string) (*client.SavepointResponse, error) { 58 | if m.CheckSavepointStatusFunc != nil { 59 | return m.CheckSavepointStatusFunc(ctx, url, jobID, triggerID) 60 | } 61 | return nil, nil 62 | } 63 | 64 | func (m *JobManagerClient) GetJobs(ctx context.Context, url string) (*client.GetJobsResponse, error) { 65 | if m.GetJobsFunc != nil { 66 | return m.GetJobsFunc(ctx, url) 67 | } 68 | return nil, nil 69 | } 70 | 71 | func (m *JobManagerClient) GetClusterOverview(ctx context.Context, url string) (*client.ClusterOverviewResponse, error) { 72 | if m.GetClusterOverviewFunc != nil { 73 | return m.GetClusterOverviewFunc(ctx, url) 74 | } 75 | return nil, nil 76 | } 77 | 78 | func (m *JobManagerClient) GetJobConfig(ctx context.Context, url string, jobID string) (*client.JobConfigResponse, error) { 79 | if m.GetJobConfigFunc != nil { 80 | return m.GetJobConfigFunc(ctx, url, jobID) 81 | } 82 | return nil, nil 83 | } 84 | 85 | func (m *JobManagerClient) GetLatestCheckpoint(ctx context.Context, url string, jobID string) (*client.CheckpointStatistics, error) { 86 | if m.GetLatestCheckpointFunc != nil { 87 | return m.GetLatestCheckpointFunc(ctx, url, jobID) 88 | } 89 | return nil, nil 90 | } 91 | 92 | func (m *JobManagerClient) GetTaskManagers(ctx context.Context, url string) (*client.TaskManagersResponse, error) { 93 | if m.GetTaskManagersFunc != nil { 94 | return m.GetTaskManagersFunc(ctx, url) 95 | } 96 | return nil, nil 97 | } 98 | 99 | func (m *JobManagerClient) GetCheckpointCounts(ctx context.Context, url string, jobID string) (*client.CheckpointResponse, error) { 100 | if m.GetCheckpointCountsFunc != nil { 101 | return m.GetCheckpointCountsFunc(ctx, url, jobID) 102 | } 103 | return nil, nil 104 | } 105 | 106 | func (m *JobManagerClient) GetJobOverview(ctx context.Context, url string, jobID string) (*client.FlinkJobOverview, error) { 107 | if m.GetJobOverviewFunc != nil { 108 | return m.GetJobOverviewFunc(ctx, url, jobID) 109 | } 110 | return nil, nil 111 | } 112 | 113 | func (m *JobManagerClient) SavepointJob(ctx context.Context, url string, jobID string) (string, error) { 114 | if m.SavepointJobFunc != nil { 115 | return m.SavepointJobFunc(ctx, url, jobID) 116 | } 117 | 118 | return "", nil 119 | } 120 | -------------------------------------------------------------------------------- /pkg/controller/flink/container_utils_test.go: -------------------------------------------------------------------------------- 1 | package flink 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/lyft/flinkk8soperator/pkg/apis/app/v1beta1" 7 | "github.com/stretchr/testify/assert" 8 | v1 "k8s.io/api/core/v1" 9 | "k8s.io/apimachinery/pkg/api/resource" 10 | ) 11 | 12 | func TestHashForApplication(t *testing.T) { 13 | app := v1beta1.FlinkApplication{} 14 | taskSlots := int32(8) 15 | app.Spec.TaskManagerConfig.TaskSlots = &taskSlots 16 | app.Spec.Parallelism = 4 17 | app.Name = "app-name" 18 | app.Namespace = "ns" 19 | app.Spec.Image = "abcdef" 20 | app.ObjectMeta.Labels = map[string]string{ 21 | "label-k": "label-v", 22 | } 23 | app.ObjectMeta.Annotations = map[string]string{ 24 | "annotation-k": "annotation-v", 25 | } 26 | 27 | h1 := HashForApplication(&app) 28 | assert.Equal(t, 8, len(h1)) 
29 | 30 | app.Name = "another-name" 31 | h2 := HashForApplication(&app) 32 | assert.NotEqual(t, h1, h2) 33 | 34 | app.Spec.Image = "zxy" 35 | h3 := HashForApplication(&app) 36 | assert.NotEqual(t, h2, h3) 37 | 38 | app.Labels["label-k"] = "new-v" 39 | h4 := HashForApplication(&app) 40 | assert.NotEqual(t, h3, h4) 41 | 42 | app.Annotations["annotation-k"] = "new-v" 43 | h5 := HashForApplication(&app) 44 | assert.NotEqual(t, h4, h5) 45 | 46 | app.Spec.Parallelism = 7 47 | h6 := HashForApplication(&app) 48 | assert.NotEqual(t, h5, h6) 49 | } 50 | 51 | func TestHashForDifferentResourceScales(t *testing.T) { 52 | app1 := v1beta1.FlinkApplication{} 53 | app1.Spec.TaskManagerConfig.Resources = &v1.ResourceRequirements{ 54 | Requests: v1.ResourceList{ 55 | v1.ResourceCPU: resource.MustParse("0.5"), 56 | v1.ResourceMemory: resource.MustParse("1024Mi"), 57 | }, 58 | Limits: v1.ResourceList{ 59 | v1.ResourceCPU: resource.MustParse("0.5"), 60 | v1.ResourceMemory: resource.MustParse("1024Mi"), 61 | }, 62 | } 63 | 64 | app2 := v1beta1.FlinkApplication{} 65 | app2.Spec.TaskManagerConfig.Resources = &v1.ResourceRequirements{ 66 | Requests: v1.ResourceList{ 67 | v1.ResourceCPU: resource.MustParse("500m"), 68 | v1.ResourceMemory: resource.MustParse("1024Mi"), 69 | }, 70 | Limits: v1.ResourceList{ 71 | v1.ResourceCPU: resource.MustParse("500m"), 72 | v1.ResourceMemory: resource.MustParse("1024Mi"), 73 | }, 74 | } 75 | 76 | assert.Equal(t, HashForApplication(&app1), HashForApplication(&app2)) 77 | } 78 | -------------------------------------------------------------------------------- /pkg/controller/flink/ingress.go: -------------------------------------------------------------------------------- 1 | package flink 2 | 3 | import ( 4 | "fmt" 5 | "regexp" 6 | 7 | flinkapp "github.com/lyft/flinkk8soperator/pkg/apis/app/v1beta1" 8 | "github.com/lyft/flinkk8soperator/pkg/controller/common" 9 | "github.com/lyft/flinkk8soperator/pkg/controller/config" 10 | "github.com/lyft/flinkk8soperator/pkg/controller/k8" 11 | networkV1 "k8s.io/api/networking/v1" 12 | v1 "k8s.io/apimachinery/pkg/apis/meta/v1" 13 | ) 14 | 15 | const AppIngressName = "%s-%s" 16 | 17 | var inputRegex = regexp.MustCompile(`{{[$]jobCluster}}`) 18 | 19 | func ReplaceJobURL(value string, input string) string { 20 | return inputRegex.ReplaceAllString(value, input) 21 | } 22 | 23 | func GetFlinkUIIngressURL(jobName string) string { 24 | return ReplaceJobURL(config.GetConfig().FlinkIngressURLFormat, jobName) 25 | } 26 | 27 | func FetchJobManagerIngressCreateObj(app *flinkapp.FlinkApplication) *networkV1.Ingress { 28 | podLabels := common.DuplicateMap(app.Labels) 29 | podLabels = common.CopyMap(podLabels, k8.GetAppLabel(app.Name)) 30 | 31 | ingressMeta := v1.ObjectMeta{ 32 | Name: getJobManagerServiceName(app), 33 | Labels: podLabels, 34 | Namespace: app.Namespace, 35 | OwnerReferences: []v1.OwnerReference{ 36 | *v1.NewControllerRef(app, app.GroupVersionKind()), 37 | }, 38 | } 39 | 40 | backend := networkV1.IngressBackend{ 41 | Service: &networkV1.IngressServiceBackend{ 42 | Name: getJobManagerServiceName(app), 43 | Port: networkV1.ServiceBackendPort{ 44 | Number: getUIPort(app), 45 | }, 46 | }, 47 | } 48 | 49 | pathType := networkV1.PathTypeImplementationSpecific 50 | ingressSpec := networkV1.IngressSpec{ 51 | Rules: []networkV1.IngressRule{{ 52 | Host: GetFlinkUIIngressURL(getIngressName(app)), 53 | IngressRuleValue: networkV1.IngressRuleValue{ 54 | HTTP: &networkV1.HTTPIngressRuleValue{ 55 | Paths: []networkV1.HTTPIngressPath{{ 56 | Backend: backend, 57 
| PathType: &pathType, 58 | }}, 59 | }, 60 | }, 61 | }}, 62 | } 63 | return &networkV1.Ingress{ 64 | ObjectMeta: ingressMeta, 65 | TypeMeta: v1.TypeMeta{ 66 | APIVersion: networkV1.SchemeGroupVersion.String(), 67 | Kind: k8.Ingress, 68 | }, 69 | Spec: ingressSpec, 70 | } 71 | 72 | } 73 | 74 | func getIngressName(app *flinkapp.FlinkApplication) string { 75 | if flinkapp.IsBlueGreenDeploymentMode(app.Spec.DeploymentMode) { 76 | return fmt.Sprintf(AppIngressName, app.Name, string(app.Status.UpdatingVersion)) 77 | } 78 | return app.Name 79 | } 80 | -------------------------------------------------------------------------------- /pkg/controller/flink/ingress_test.go: -------------------------------------------------------------------------------- 1 | package flink 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/lyft/flinkk8soperator/pkg/apis/app/v1beta1" 7 | 8 | config2 "github.com/lyft/flinkk8soperator/pkg/controller/config" 9 | "github.com/stretchr/testify/assert" 10 | ) 11 | 12 | func TestReplaceJobUrl(t *testing.T) { 13 | assert.Equal(t, 14 | "ABC.lyft.xyz", 15 | ReplaceJobURL("{{$jobCluster}}.lyft.xyz", "ABC")) 16 | } 17 | 18 | func initTestConfigForIngress() error { 19 | return config2.ConfigSection.SetConfig(&config2.Config{ 20 | FlinkIngressURLFormat: "{{$jobCluster}}.lyft.xyz", 21 | }) 22 | } 23 | func TestGetFlinkUIIngressURL(t *testing.T) { 24 | err := initTestConfigForIngress() 25 | assert.Nil(t, err) 26 | assert.Equal(t, 27 | "ABC.lyft.xyz", 28 | GetFlinkUIIngressURL("ABC")) 29 | } 30 | 31 | func TestGetFlinkUIIngressURLBlueGreenDeployment(t *testing.T) { 32 | err := initTestConfigForIngress() 33 | assert.Nil(t, err) 34 | app := v1beta1.FlinkApplication{} 35 | app.Spec.DeploymentMode = v1beta1.DeploymentModeBlueGreen 36 | app.Name = "ABC" 37 | app.Status.UpdatingVersion = v1beta1.GreenFlinkApplication 38 | assert.Equal(t, "ABC-green", getIngressName(&app)) 39 | assert.Equal(t, 40 | "ABC-green.lyft.xyz", 41 | GetFlinkUIIngressURL(getIngressName(&app))) 42 | } 43 | -------------------------------------------------------------------------------- /pkg/controller/flink/mock/mock_error_handler.go: -------------------------------------------------------------------------------- 1 | package mock 2 | 3 | import ( 4 | "time" 5 | 6 | "k8s.io/utils/clock" 7 | ) 8 | 9 | type IsErrorRetryableFunc func(err error) bool 10 | type IsRetryRemainingFunc func(err error, retryCount int32) bool 11 | type IsErrorFailFastFunc func(err error) bool 12 | type WaitOnErrorFunc func(clock clock.Clock, lastUpdatedTime time.Time) (time.Duration, bool) 13 | type GetRetryDelayFunc func(retryCount int32) time.Duration 14 | type IsTimeToRetryFunc func(clock clock.Clock, lastUpdatedTime time.Time, retryCount int32) bool 15 | 16 | type RetryHandler struct { 17 | IsErrorRetryableFunc IsErrorRetryableFunc 18 | IsRetryRemainingFunc IsRetryRemainingFunc 19 | IsErrorFailFastFunc IsErrorFailFastFunc 20 | WaitOnErrorFunc WaitOnErrorFunc 21 | GetRetryDelayFunc GetRetryDelayFunc 22 | IsTimeToRetryFunc IsTimeToRetryFunc 23 | } 24 | 25 | func (e RetryHandler) IsErrorRetryable(err error) bool { 26 | if e.IsErrorRetryableFunc != nil { 27 | return e.IsErrorRetryableFunc(err) 28 | } 29 | 30 | return false 31 | } 32 | 33 | func (e RetryHandler) IsErrorFailFast(err error) bool { 34 | if e.IsErrorFailFastFunc != nil { 35 | return e.IsErrorFailFastFunc(err) 36 | } 37 | 38 | return false 39 | } 40 | 41 | func (e RetryHandler) IsRetryRemaining(err error, retryCount int32) bool { 42 | if e.IsRetryRemainingFunc != nil { 43 | return 
e.IsRetryRemainingFunc(err, retryCount) 44 | } 45 | 46 | return false 47 | } 48 | 49 | func (e RetryHandler) WaitOnError(clock clock.Clock, lastUpdatedTime time.Time) (time.Duration, bool) { 50 | if e.WaitOnErrorFunc != nil { 51 | return e.WaitOnErrorFunc(clock, lastUpdatedTime) 52 | } 53 | 54 | return time.Duration(time.Now().UnixNano()), true 55 | } 56 | 57 | func (e RetryHandler) GetRetryDelay(retryCount int32) time.Duration { 58 | if e.GetRetryDelayFunc != nil { 59 | return e.GetRetryDelayFunc(retryCount) 60 | } 61 | 62 | return time.Duration(time.Now().UnixNano()) 63 | } 64 | 65 | func (e RetryHandler) IsTimeToRetry(clock clock.Clock, lastUpdatedTime time.Time, retryCount int32) bool { 66 | if e.IsTimeToRetryFunc != nil { 67 | return e.IsTimeToRetryFunc(clock, lastUpdatedTime, retryCount) 68 | } 69 | return false 70 | } 71 | -------------------------------------------------------------------------------- /pkg/controller/flink/mock/mock_job_manager_controller.go: -------------------------------------------------------------------------------- 1 | package mock 2 | 3 | import ( 4 | "context" 5 | 6 | "github.com/lyft/flinkk8soperator/pkg/apis/app/v1beta1" 7 | ) 8 | 9 | type JobManagerController struct { 10 | CreateIfNotExistFunc CreateIfNotExistFunc 11 | } 12 | 13 | func (m *JobManagerController) CreateIfNotExist( 14 | ctx context.Context, 15 | application *v1beta1.FlinkApplication) (bool, error) { 16 | if m.CreateIfNotExistFunc != nil { 17 | return m.CreateIfNotExistFunc(ctx, application) 18 | } 19 | return false, nil 20 | } 21 | -------------------------------------------------------------------------------- /pkg/controller/flink/mock/mock_task_manager_controller.go: -------------------------------------------------------------------------------- 1 | package mock 2 | 3 | import ( 4 | "context" 5 | 6 | "github.com/lyft/flinkk8soperator/pkg/apis/app/v1beta1" 7 | ) 8 | 9 | type CreateIfNotExistFunc func(ctx context.Context, application *v1beta1.FlinkApplication) (bool, error) 10 | 11 | type TaskManagerController struct { 12 | CreateIfNotExistFunc CreateIfNotExistFunc 13 | } 14 | 15 | func (m *TaskManagerController) CreateIfNotExist( 16 | ctx context.Context, application *v1beta1.FlinkApplication) (bool, error) { 17 | if m.CreateIfNotExistFunc != nil { 18 | return m.CreateIfNotExistFunc(ctx, application) 19 | } 20 | return false, nil 21 | } 22 | -------------------------------------------------------------------------------- /pkg/controller/flinkapplication/controller.go: -------------------------------------------------------------------------------- 1 | package flinkapplication 2 | 3 | import ( 4 | "context" 5 | 6 | "github.com/lyft/flytestdlib/promutils" 7 | "github.com/lyft/flytestdlib/promutils/labeled" 8 | 9 | "github.com/lyft/flinkk8soperator/pkg/apis/app/v1beta1" 10 | "github.com/lyft/flinkk8soperator/pkg/controller/config" 11 | "sigs.k8s.io/controller-runtime/pkg/controller" 12 | 13 | "time" 14 | 15 | "github.com/lyft/flinkk8soperator/pkg/controller/k8" 16 | "github.com/lyft/flytestdlib/contextutils" 17 | "github.com/lyft/flytestdlib/logger" 18 | v1 "k8s.io/api/apps/v1" 19 | coreV1 "k8s.io/api/core/v1" 20 | metaV1 "k8s.io/apimachinery/pkg/apis/meta/v1" 21 | "k8s.io/apimachinery/pkg/types" 22 | "sigs.k8s.io/controller-runtime/pkg/cache" 23 | "sigs.k8s.io/controller-runtime/pkg/client" 24 | "sigs.k8s.io/controller-runtime/pkg/event" 25 | "sigs.k8s.io/controller-runtime/pkg/handler" 26 | "sigs.k8s.io/controller-runtime/pkg/manager" 27 | "sigs.k8s.io/controller-runtime/pkg/predicate" 
28 | "sigs.k8s.io/controller-runtime/pkg/reconcile" 29 | "sigs.k8s.io/controller-runtime/pkg/source" 30 | ) 31 | 32 | // ReconcileFlinkApplication reconciles a FlinkApplication resource 33 | type ReconcileFlinkApplication struct { 34 | client client.Client 35 | cache cache.Cache 36 | metrics *reconcilerMetrics 37 | flinkStateMachine FlinkHandlerInterface 38 | } 39 | 40 | type reconcilerMetrics struct { 41 | scope promutils.Scope 42 | cacheHit labeled.Counter 43 | cacheMiss labeled.Counter 44 | reconcileError labeled.Counter 45 | } 46 | 47 | func newReconcilerMetrics(scope promutils.Scope) *reconcilerMetrics { 48 | reconcilerScope := scope.NewSubScope("reconciler") 49 | return &reconcilerMetrics{ 50 | scope: reconcilerScope, 51 | cacheHit: labeled.NewCounter("cache_hit", "Flink application resource fetched from cache", reconcilerScope), 52 | cacheMiss: labeled.NewCounter("cache_miss", "Flink application resource missing from cache", reconcilerScope), 53 | reconcileError: labeled.NewCounter("reconcile_error", "Reconcile for application failed", reconcilerScope), 54 | } 55 | } 56 | 57 | func (r *ReconcileFlinkApplication) getResource(ctx context.Context, key types.NamespacedName, obj client.Object) error { 58 | err := r.cache.Get(ctx, key, obj) 59 | if err != nil && k8.IsK8sObjectDoesNotExist(err) { 60 | r.metrics.cacheMiss.Inc(ctx) 61 | return r.client.Get(ctx, key, obj) 62 | } 63 | if err == nil { 64 | r.metrics.cacheHit.Inc(ctx) 65 | } 66 | return err 67 | } 68 | 69 | // For failures, we do not want to retry immediately, as we want the underlying resource to recover. 70 | // At the same time, we want to retry faster than the regular success interval. 71 | func (r *ReconcileFlinkApplication) getFailureRetryInterval() time.Duration { 72 | return config.GetConfig().ResyncPeriod.Duration / 2 73 | } 74 | 75 | func (r *ReconcileFlinkApplication) getReconcileResultForError(err error) reconcile.Result { 76 | if err == nil { 77 | return reconcile.Result{} 78 | } 79 | return reconcile.Result{ 80 | RequeueAfter: r.getFailureRetryInterval(), 81 | } 82 | } 83 | 84 | func (r *ReconcileFlinkApplication) Reconcile(ctx context.Context, request reconcile.Request) (reconcile.Result, error) { 85 | ctx = contextutils.WithNamespace(ctx, request.Namespace) 86 | ctx = contextutils.WithAppName(ctx, request.Name) 87 | typeMeta := metaV1.TypeMeta{ 88 | Kind: v1beta1.FlinkApplicationKind, 89 | APIVersion: v1beta1.SchemeGroupVersion.String(), 90 | } 91 | // Fetch the FlinkApplication instance 92 | instance := &v1beta1.FlinkApplication{ 93 | TypeMeta: typeMeta, 94 | } 95 | 96 | err := r.getResource(ctx, request.NamespacedName, instance) 97 | if err != nil { 98 | if k8.IsK8sObjectDoesNotExist(err) { 99 | // Request object not found, could have been deleted after reconcile request. 100 | // Return and don't requeue 101 | return reconcile.Result{}, nil 102 | } 103 | // Error reading the object - we will check again in next loop 104 | return r.getReconcileResultForError(err), nil 105 | } 106 | // We are seeing instances where getResource is removing TypeMeta 107 | instance.TypeMeta = typeMeta 108 | ctx = contextutils.WithPhase(ctx, string(instance.Status.Phase)) 109 | err = r.flinkStateMachine.Handle(ctx, instance) 110 | if err != nil { 111 | r.metrics.reconcileError.Inc(ctx) 112 | logger.Warnf(ctx, "Failed to reconcile resource %v: %v", request.NamespacedName, err) 113 | } 114 | return r.getReconcileResultForError(err), err 115 | } 116 | 117 | // Add creates a new FlinkApplication Controller and adds it to the Manager. 
The Manager will set fields on the Controller 118 | // and Start it when the Manager is Started. 119 | func Add(ctx context.Context, mgr manager.Manager, cfg config.RuntimeConfig) error { 120 | k8sCluster := k8.NewK8Cluster(mgr, cfg) 121 | eventRecorder := mgr.GetEventRecorderFor(config.AppName) 122 | flinkStateMachine := NewFlinkStateMachine(k8sCluster, eventRecorder, cfg) 123 | 124 | metrics := newReconcilerMetrics(cfg.MetricsScope) 125 | reconciler := ReconcileFlinkApplication{ 126 | client: mgr.GetClient(), 127 | cache: mgr.GetCache(), 128 | metrics: metrics, 129 | flinkStateMachine: flinkStateMachine, 130 | } 131 | 132 | c, err := controller.New(config.AppName, mgr, controller.Options{ 133 | MaxConcurrentReconciles: config.GetConfig().Workers, 134 | Reconciler: &reconciler, 135 | }) 136 | 137 | if err != nil { 138 | return err 139 | } 140 | 141 | if err = c.Watch(&source.Kind{Type: &v1beta1.FlinkApplication{}}, &handler.EnqueueRequestForObject{}); err != nil { 142 | return err 143 | } 144 | 145 | // Watch deployments and services for the application 146 | if err := c.Watch(&source.Kind{Type: &v1.Deployment{}}, &handler.Funcs{}, getPredicateFuncs()); err != nil { 147 | return err 148 | } 149 | 150 | if err := c.Watch(&source.Kind{Type: &coreV1.Service{}}, &handler.Funcs{}, getPredicateFuncs()); err != nil { 151 | return err 152 | } 153 | return nil 154 | } 155 | 156 | func isOwnedByFlinkApplication(ownerReferences []metaV1.OwnerReference) bool { 157 | for _, ownerReference := range ownerReferences { 158 | if ownerReference.APIVersion == v1beta1.SchemeGroupVersion.String() && 159 | ownerReference.Kind == v1beta1.FlinkApplicationKind { 160 | return true 161 | } 162 | } 163 | return false 164 | } 165 | 166 | // Predicate filters events before enqueuing the keys. 
167 | // We are only interested in kubernetes objects that are owned by the FlinkApplication 168 | // This filters out all the objects not owned by the flinkApplication, and ensures only that subset reaches the event handlers 169 | func getPredicateFuncs() predicate.Funcs { 170 | return predicate.Funcs{ 171 | CreateFunc: func(e event.CreateEvent) bool { 172 | return isOwnedByFlinkApplication(e.Object.GetOwnerReferences()) 173 | }, 174 | UpdateFunc: func(e event.UpdateEvent) bool { 175 | return isOwnedByFlinkApplication(e.ObjectNew.GetOwnerReferences()) 176 | }, 177 | DeleteFunc: func(e event.DeleteEvent) bool { 178 | return isOwnedByFlinkApplication(e.Object.GetOwnerReferences()) 179 | }, 180 | GenericFunc: func(e event.GenericEvent) bool { 181 | return isOwnedByFlinkApplication(e.Object.GetOwnerReferences()) 182 | }, 183 | } 184 | } 185 | -------------------------------------------------------------------------------- /pkg/controller/k8/mock/mock_k8.go: -------------------------------------------------------------------------------- 1 | package mock 2 | 3 | import ( 4 | "context" 5 | 6 | v1 "k8s.io/api/apps/v1" 7 | corev1 "k8s.io/api/core/v1" 8 | "sigs.k8s.io/controller-runtime/pkg/client" 9 | ) 10 | 11 | type GetDeploymentsWithLabelFunc func(ctx context.Context, namespace string, labelMap map[string]string) (*v1.DeploymentList, error) 12 | type CreateK8ObjectFunc func(ctx context.Context, object client.Object) error 13 | type GetServiceFunc func(ctx context.Context, namespace string, name string, version string) (*corev1.Service, error) 14 | type GetServiceWithLabelFunc func(ctx context.Context, namespace string, labelMap map[string]string) (*corev1.ServiceList, error) 15 | type UpdateK8ObjectFunc func(ctx context.Context, object client.Object) error 16 | type UpdateStatusFunc func(ctx context.Context, object client.Object) error 17 | type DeleteK8ObjectFunc func(ctx context.Context, object client.Object) error 18 | 19 | type K8Cluster struct { 20 | GetDeploymentsWithLabelFunc GetDeploymentsWithLabelFunc 21 | GetServiceFunc GetServiceFunc 22 | GetServicesWithLabelFunc GetServiceWithLabelFunc 23 | CreateK8ObjectFunc CreateK8ObjectFunc 24 | UpdateK8ObjectFunc UpdateK8ObjectFunc 25 | UpdateStatusFunc UpdateStatusFunc 26 | DeleteK8ObjectFunc DeleteK8ObjectFunc 27 | } 28 | 29 | func (m *K8Cluster) GetDeploymentsWithLabel(ctx context.Context, namespace string, labelMap map[string]string) (*v1.DeploymentList, error) { 30 | if m.GetDeploymentsWithLabelFunc != nil { 31 | return m.GetDeploymentsWithLabelFunc(ctx, namespace, labelMap) 32 | } 33 | return nil, nil 34 | } 35 | 36 | func (m *K8Cluster) GetServicesWithLabel(ctx context.Context, namespace string, labelMap map[string]string) (*corev1.ServiceList, error) { 37 | if m.GetServicesWithLabelFunc != nil { 38 | return m.GetServicesWithLabelFunc(ctx, namespace, labelMap) 39 | } 40 | return nil, nil 41 | } 42 | 43 | func (m *K8Cluster) GetService(ctx context.Context, namespace string, name string, version string) (*corev1.Service, error) { 44 | if m.GetServiceFunc != nil { 45 | return m.GetServiceFunc(ctx, namespace, name, version) 46 | } 47 | return nil, nil 48 | } 49 | 50 | func (m *K8Cluster) CreateK8Object(ctx context.Context, object client.Object) error { 51 | if m.CreateK8ObjectFunc != nil { 52 | return m.CreateK8ObjectFunc(ctx, object) 53 | } 54 | return nil 55 | } 56 | 57 | func (m *K8Cluster) UpdateK8Object(ctx context.Context, object client.Object) error { 58 | if m.UpdateK8ObjectFunc != nil { 59 | return m.UpdateK8ObjectFunc(ctx, object) 60 | } 61
| return nil 62 | } 63 | 64 | func (m *K8Cluster) UpdateStatus(ctx context.Context, object client.Object) error { 65 | if m.UpdateStatusFunc != nil { 66 | return m.UpdateStatusFunc(ctx, object) 67 | } 68 | return nil 69 | } 70 | 71 | func (m *K8Cluster) DeleteK8Object(ctx context.Context, object client.Object) error { 72 | if m.DeleteK8ObjectFunc != nil { 73 | return m.DeleteK8ObjectFunc(ctx, object) 74 | } 75 | return nil 76 | } 77 | -------------------------------------------------------------------------------- /pkg/controller/k8/utils.go: -------------------------------------------------------------------------------- 1 | package k8 2 | 3 | import ( 4 | v1 "k8s.io/api/apps/v1" 5 | k8serrors "k8s.io/apimachinery/pkg/api/errors" 6 | ) 7 | 8 | const ( 9 | AppKey = "flink-app" 10 | ) 11 | 12 | func IsK8sObjectDoesNotExist(err error) bool { 13 | return k8serrors.IsNotFound(err) || k8serrors.IsGone(err) || k8serrors.IsResourceExpired(err) 14 | } 15 | 16 | func GetAppLabel(appName string) map[string]string { 17 | return map[string]string{ 18 | AppKey: appName, 19 | } 20 | } 21 | 22 | func GetDeploymentWithName(deployments []v1.Deployment, name string) *v1.Deployment { 23 | if len(deployments) == 0 { 24 | return nil 25 | } 26 | for _, deployment := range deployments { 27 | if deployment.Name == name { 28 | return &deployment 29 | } 30 | } 31 | return nil 32 | } 33 | -------------------------------------------------------------------------------- /pkg/controller/k8/utils_test.go: -------------------------------------------------------------------------------- 1 | package k8 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/stretchr/testify/assert" 7 | v1 "k8s.io/api/apps/v1" 8 | ) 9 | 10 | func TestGetAppLabel(t *testing.T) { 11 | appName := "app_name" 12 | appLabel := GetAppLabel(appName) 13 | assert.Equal(t, map[string]string{ 14 | "flink-app": appName, 15 | }, appLabel) 16 | } 17 | 18 | func TestGetDeploymentWithName(t *testing.T) { 19 | name := "jm-name" 20 | dep := v1.Deployment{} 21 | dep.Name = name 22 | deployments := []v1.Deployment{ 23 | dep, 24 | } 25 | actualDeployment := GetDeploymentWithName(deployments, name) 26 | assert.NotNil(t, actualDeployment) 27 | assert.Equal(t, dep, *actualDeployment) 28 | } 29 | 30 | func TestGetDeploymentNotExists(t *testing.T) { 31 | name := "jm-name" 32 | dep := v1.Deployment{} 33 | dep.Name = name 34 | deployments := []v1.Deployment{ 35 | dep, 36 | } 37 | actualDeployment := GetDeploymentWithName(deployments, "random") 38 | assert.Nil(t, actualDeployment) 39 | } 40 | -------------------------------------------------------------------------------- /script/lint: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | #echo "************************ golint *********************************" 4 | #for pkg in $(glide nv); do 5 | # golint $pkg | grep -v comment 6 | #done 7 | #echo "*****************************************************************" 8 | 9 | echo "************************ govet **********************************" 10 | go vet $(glide nv) 11 | echo "*****************************************************************" 12 | 13 | echo "************************ goimports ******************************" 14 | goimports -d $(find . -type f -name '*.go' -not -path "./vendor/*") 15 | echo "*****************************************************************" 16 | 17 | echo "************************ gofmt *********************************" 18 | gofmt -s -w $(find . 
-type f -name '*.go' -not -path "./vendor/*") 19 | echo "*****************************************************************" -------------------------------------------------------------------------------- /tmp/build/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM alpine:3.6 2 | 3 | RUN adduser -D flinkk8soperator 4 | USER flinkk8soperator 5 | 6 | ADD tmp/_output/bin/flinkk8soperator /usr/local/bin/flinkk8soperator 7 | -------------------------------------------------------------------------------- /tmp/build/build.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -o errexit 4 | set -o nounset 5 | set -o pipefail 6 | 7 | if ! which go > /dev/null; then 8 | echo "golang needs to be installed" 9 | exit 1 10 | fi 11 | 12 | BIN_DIR="$(pwd)/tmp/_output/bin" 13 | mkdir -p ${BIN_DIR} 14 | PROJECT_NAME="flinkk8soperator" 15 | REPO_PATH="github.com/lyft/flinkk8soperator" 16 | BUILD_PATH="${REPO_PATH}/cmd/${PROJECT_NAME}" 17 | echo "building "${PROJECT_NAME}"..." 18 | GOOS=linux GOARCH=amd64 CGO_ENABLED=0 go build -o ${BIN_DIR}/${PROJECT_NAME} $BUILD_PATH 19 | -------------------------------------------------------------------------------- /tmp/build/docker_build.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | if ! which docker > /dev/null; then 4 | echo "docker needs to be installed" 5 | exit 1 6 | fi 7 | 8 | : ${IMAGE:?"Need to set IMAGE, e.g. gcr.io/<repo>/<image>-operator"} 9 | 10 | echo "building container ${IMAGE}..." 11 | docker build -t "${IMAGE}" -f tmp/build/Dockerfile . 12 | -------------------------------------------------------------------------------- /tmp/codegen/boilerplate.go.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /tmp/codegen/update-generated.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -o errexit 4 | set -o nounset 5 | set -o pipefail 6 | 7 | bash ./vendor/k8s.io/code-generator/generate-groups.sh \ 8 | deepcopy,client \ 9 | github.com/lyft/flinkk8soperator/pkg/client \ 10 | github.com/lyft/flinkk8soperator/pkg/apis \ 11 | app:v1beta1 \ 12 | --go-header-file "./tmp/codegen/boilerplate.go.txt" 13 | -------------------------------------------------------------------------------- /tools.go: -------------------------------------------------------------------------------- 1 | //go:build tools 2 | 3 | package tools 4 | 5 | import ( 6 | _ "k8s.io/code-generator" 7 | ) 8 | -------------------------------------------------------------------------------- /version/version.go: -------------------------------------------------------------------------------- 1 | package version 2 | 3 | var ( 4 | Version = "0.5.0" 5 | ) 6 | --------------------------------------------------------------------------------