├── docker ├── Dockerfile.app └── Dockerfile.base ├── scripts ├── cd-base.bash ├── cd-service.bash └── build-cd-base.bash ├── flowchart.txt ├── cfgupdater.md └── README.md /docker/Dockerfile.app: -------------------------------------------------------------------------------- 1 | # ARGs for the FROM target must occur first. 2 | # Allowing a dynamic source image allows developers to quickly build a single image, 3 | # rather than building the full Dockerfile.cd that compiles many commands. 4 | ARG BIN_IMAGE 5 | FROM ${BIN_IMAGE} as bin_image 6 | 7 | FROM alpine:3 8 | 9 | # Add certificates first, as that may be a common layer with other alpine-based images. 10 | RUN apk add --no-cache ca-certificates 11 | 12 | # The cmd will not change, so set it early for better layer caching. 13 | CMD ["/foo-svc"] 14 | 15 | # Again, having a dynamic source directory allows simplified local development. 16 | ARG BIN_SRC_DIR=/app-bins 17 | COPY --from=bin_image ${BIN_SRC_DIR}/foo-svc /foo-svc 18 | -------------------------------------------------------------------------------- /scripts/cd-base.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -eu -o pipefail 4 | 5 | # get current branch and commit sha, and force branch name to valid docker tag character set 6 | # allow forcing branch name regardless of current branch using FORCE_CI_* variables 7 | readonly branch_name="${FORCE_CI_BRANCH_NAME:-$(git rev-parse --abbrev-ref HEAD | tr -d '\n' | tr -c '[:alnum:].-' _)}" 8 | readonly git_sha="${FORCE_CI_GIT_SHA:-$(git rev-parse HEAD)}" 9 | 10 | # pull image if it exists, ignore if it does not 11 | docker pull "docker.example.com/app-bins:${branch_name}" || true 12 | 13 | docker build --pull \ 14 | --cache-from "docker.example.com/app-bins:${branch_name}" \ 15 | -t "docker.example.com/app-bins:${git_sha}" \ 16 | -f Dockerfile.cd . 
17 | 18 | docker tag \ 19 | "docker.example.com/app-bins:${git_sha}" \ 20 | "docker.example.com/app-bins:${branch_name}" 21 | 22 | docker push "docker.example.com/app-bins:${git_sha}" 23 | docker push "docker.example.com/app-bins:${branch_name}" 24 | -------------------------------------------------------------------------------- /docker/Dockerfile.base: -------------------------------------------------------------------------------- 1 | # This Dockerfile produces a single Docker image that holds a whitelisted set of built binaries, 2 | # as defined in scripts/build-cd-base.bash. 3 | # Downstream builds will copy only the binaries they need from this base layer. 4 | 5 | FROM docker.io/golang:1.12-buster as builder 6 | 7 | # Workdir isn't going to change, so set it first. 8 | # We don't need to be in a fake GOPATH anymore. 9 | WORKDIR /app 10 | 11 | # Copy over the shell scripts we use inside other downstream containers. 12 | # These scripts are less likely to change than the binaries we compile. 13 | COPY \ 14 | ./service/foo/do_something.sh \ 15 | ./service/bar/do_something_else.sh \ 16 | /app-bins/ 17 | 18 | # Assuming the vendor directory is up-to-date with `go mod vendor`. 19 | COPY . /app 20 | 21 | # Build the whitelisted binaries. 22 | RUN APP_BIN_DEST=/app-bins CLEAN_GOCACHE=1 /app/scripts/build-cd-base.bash 23 | 24 | # Our final image is only going to hold the binaries and any shell scripts needed for other images. 
25 | FROM scratch 26 | COPY --from=builder /app-bins/ /app-bins/ 27 | -------------------------------------------------------------------------------- /scripts/cd-service.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -eu -o pipefail 4 | 5 | # service to build; strip leading "cd-service-" to avoid passing CI JOB name directly from config.yml 6 | readonly service="$(echo "${1:?App required}" | sed 's,^cd-service-,,')" 7 | 8 | # get current branch and commit sha, and force branch name to valid docker tag character set 9 | # allow forcing branch name regardless of current branch using FORCE_CI_* variables 10 | readonly branch_name="${FORCE_CI_BRANCH_NAME:-$(git rev-parse --abbrev-ref HEAD | tr -d '\n' | tr -c '[:alnum:].-' _)}" 11 | readonly git_sha="${FORCE_CI_GIT_SHA:-$(git rev-parse HEAD)}" 12 | 13 | # pull image if it exists, ignore if it does not 14 | docker pull "docker.example.com/app-cd-${service}:${branch_name}" || true 15 | 16 | docker build --pull \ 17 | --cache-from "docker.example.com/app-cd-${service}:${branch_name}" \ 18 | -t "docker.example.com/app-cd-${service}:${git_sha}" \ 19 | --build-arg BIN_IMAGE=docker.example.com/app-bins:${git_sha} \ 20 | -f "apps/${service}/Dockerfile.cd" . 21 | 22 | docker tag \ 23 | "docker.example.com/app-cd-${service}:${git_sha}" \ 24 | "docker.example.com/app-cd-${service}:${branch_name}" 25 | 26 | docker push "docker.example.com/app-cd-${service}:${git_sha}" 27 | docker push "docker.example.com/app-cd-${service}:${branch_name}" 28 | 29 | # Construct a JSON object for this image's Tag and Digest. 30 | # I was not able to retrieve the digest when calling docker images against a specific tag, 31 | # so get all the tags and digests for the given image and grep for the matching tag. 
32 | jq --null-input --sort-keys \ 33 | --arg tagDigest "$(docker images "docker.example.com/app-cd-${service}" --format '{{.Tag}} {{.Digest}}' | grep "^${git_sha}")" \ 34 | --arg imgPrefix "docker.example.com/app-cd-$service" \ 35 | --arg serviceKey "$SERVICEKEY" \ 36 | '$tagDigest | split(" ") as $td | { 37 | ($serviceKey): { 38 | Tag: ($imgPrefix + ":" + $td[0]), 39 | Digest: ($imgPrefix + "@" + $td[1]), 40 | } 41 | }' | tee "/artifacts/$SERVICEKEY.json" 42 | -------------------------------------------------------------------------------- /scripts/build-cd-base.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script is called from Dockerfile.cd to build all of the necessary commands in one base image, 4 | # so that we amortize compile costs. 5 | # Then, downstream images can simply copy their binary from this base image. 6 | 7 | set -eux 8 | 9 | if [ -z "${APP_BIN_DEST:-}" ]; then 10 | # Requiring the output as an explicit argument, 11 | # so the script can be run in container or on workstation. (The :- default expansion keeps set -u from aborting before this message prints.) 12 | >&2 echo '$APP_BIN_DEST must be set as destination directory for built binaries.' 13 | exit 1 14 | fi 15 | 16 | # Sanity checks. 17 | if [ ! -f go.mod ]; then 18 | >&2 echo 'This script must be run from the root of the app repository.' 19 | exit 1 20 | fi 21 | modheader="$(head -n 1 go.mod)" 22 | if [ "$modheader" != 'module example.com/app' ]; then 23 | >&2 echo 'go.mod detected, but it does not appear to be the app module.' 24 | exit 1 25 | fi 26 | 27 | # All of the commands we are going to build, as a bash array. 28 | # There are some that we will never build, so we whitelist required builds instead of using ./cmd/... . 29 | cmds=( 30 | foo 31 | bar 32 | baz 33 | ) 34 | 35 | # Common go flags to the upcoming go build commands. 36 | export CGO_ENABLED=0 GOOS=linux GO111MODULE=on 37 | 38 | # GOFLAGS is special. 39 | # We always want -mod=vendor in this script. 
40 | # We want to hardcode an empty buildid so that builds are bit-for-bit identical. 41 | # See https://github.com/golang/go/issues/33772. 42 | # After upgrading to Go 1.13, it will probably make sense to add -trimpath. 43 | export GOFLAGS='-mod=vendor -ldflags=-buildid=' 44 | 45 | # Helpful log output to see the exact version of Go used. 46 | go version 47 | 48 | # Build ./cmd/foo for foo in $cmds (https://stackoverflow.com/a/12744170). 49 | # And build it as one command, to maximize concurrency. 50 | # When we switch to Go 1.13, we can use "-o $dir" to emit all the executables, 51 | # but until then, we build once to warm the cache and then emit the individual binaries. 52 | # Run with time just for simple insight on how long this takes. 53 | time go build "${cmds[@]/#/./cmd/}" 54 | 55 | # Again, this can be refactored away after we move to Go 1.13. 56 | for cmd in "${cmds[@]}"; do 57 | time go build -o "$APP_BIN_DEST/$cmd" "./cmd/$cmd" 58 | done 59 | 60 | # If you run this script on your workstation, you probably don't want to blow away your go cache. 61 | # But if you run it in a Docker container, the cache just wastes disk space / image size. 62 | # (Variable expansion with defaulting, to avoid an unset variable error.) 
63 | if [ -n "${CLEAN_GOCACHE:-}" ]; then 64 | go clean -cache 65 | fi 66 | -------------------------------------------------------------------------------- /flowchart.txt: -------------------------------------------------------------------------------- 1 | ┌───────────────────────────┐ 2 | │ │ 3 | │ app.git │ 4 | │ │ 5 | └───────────────────────────┘ 6 | │ 7 | │ 8 | Master 9 | branch is 10 | updated 11 | │ 12 | ▼ 13 | ┌────────────────────────────┐ 14 | │ CI pipeline builds Docker │ 15 | │images for every service in │ 16 | │ the application │ 17 | └────────────────────────────┘ 18 | │ 19 | CI posts new image tags and 20 | digests to cfgupdater 21 | │ 22 | │ 23 | ▼ 24 | ┌────────────────────────────────────┐ 25 | │ cfgupdater creates a new commit in │ 26 | │ cfg-app to update Staging images │ 27 | │ to the most recent app commit │ 28 | │ │ 29 | └────────────────────────────────────┘ ┌─────────────────────┐ 30 | │ │ Developer directly │ 31 | │ │ modifies Jsonnet to │ 32 | Effectively, git ┌──────────│ update Kubernetes │ 33 | push │ │ resource(s) │ 34 | │ │ └─────────────────────┘ 35 | ▼ ▼ 36 | ┌───────────────────────────────┐ 37 | │ cfg-app.git │ 38 | │ ├─────────────────┐ 39 | ┌──────▶│(Jsonnet to generate k8s YAML) │ │ 40 | │ ┌────▶│ ├───────┐ │ 41 | │ │ └───────────────────────────────┘ │ │ 42 | │ │ │ │ │ 43 | │ │ │ │ │ 44 | │ │ Argo observes cfg-app.git │ │ 45 | │ │ update on Staging env │ │ 46 | │ │ config │ │ 47 | │ │ │ │ │ 48 | │ │ ▼ │ │ 49 | │ │ ┌──────────────────────────────┐ │ │ 50 | │ │ │ Argo CD deploys to Staging │ │ │ 51 | │ │ │ envs │ │ │ 52 | │ │ │ │ Argo observes │ 53 | │ │ └──────────────────────────────┘ cfg-app.git │ 54 | │ │ │ update on │ 55 | │ │ │ Internal env │ 56 | │ │ Argo CD post-deploy hook config │ 57 | │ │ indicating successful deploy │ │ 58 | │ │ │ │ │ 59 | │ │ │ │ │ 60 | │ │ ▼ │ │ 61 | │ │ ┌─────────────────────────────────────┐ │ │ 62 | │ │ │ │ │ │ 63 | │ │ │ cfgupdater creates a new commit in │ │ │ 64 | │ │ │ cfg-app, promoting 
the images │ │ │ 65 | │ └──│ successfully deployed to Staging, │ │ Argo observes 66 | │ │ into Internal environments │ │ cfg-app.git 67 | │ │ │ │ update on 68 | │ └─────────────────────────────────────┘ │ Production env 69 | │ │ config 70 | │ │ │ 71 | │ │ │ 72 | │ ┌──────────────────────────────┐ │ │ 73 | │ │ Argo CD deploys to Internal │ │ │ 74 | │ │ envs │◀──────┘ │ 75 | │ └──────────────────────────────┘ │ 76 | │ │ │ 77 | │ Argo CD post-deploy │ 78 | │ hook indicating │ 79 | │ successful deploy │ 80 | │ │ │ 81 | │ │ │ 82 | │ ▼ │ 83 | │ ┌─────────────────────────────────────┐ │ 84 | │ │ │ │ 85 | │ │ cfgupdater creates a new commit in │ │ 86 | │ │ cfg-app, promoting the images │ │ 87 | └─────│ successfully deployed to Internal, │ │ 88 | │ into Production environments │ │ 89 | │ │ │ 90 | └─────────────────────────────────────┘ │ 91 | │ 92 | │ 93 | ┌──────────────────────────────┐ │ 94 | │Argo CD deploys to Production │ │ 95 | │ environments │◀───────────────┘ 96 | │ │ 97 | └──────────────────────────────┘ 98 | 99 | Created with Monodraw 100 | -------------------------------------------------------------------------------- /cfgupdater.md: -------------------------------------------------------------------------------- 1 | # cfgupdater 2 | 3 | This document describes the cfgupdater service in detail. 4 | Refer to [README.md](/README.md) for an overview of the overall patterns InfluxData uses for CD/GitOps. 5 | 6 | ## Overview 7 | 8 | cfgupdater is implemented as a [GitHub App](https://developer.github.com/apps/). 9 | The overall patterns would generally be the same if you were writing a service to interact with another source control provider. 
10 | 11 | The API endpoints look like: 12 | 13 | - One endpoint to be triggered from `app`'s CI pipeline to announce a new set of images 14 | - One set of endpoints to be triggered from Argo CD post sync hooks, to handle promoting images between environment sets 15 | - One endpoint to receive GitHub webhooks for commit status updates on the cfg-app repository. 16 | 17 | You will notice that the first two endpoints map to the "accessible entrypoints" of the config repository. 18 | The third endpoint is so that we follow the 19 | ["Not Rocket Science Rule of Software Engineering"](https://graydon.livejournal.com/186550.html) 20 | by pushing automatic commits to a branch that is rebased on master, 21 | and master is always fast-forwarded to a commit that has already passed CI. 22 | 23 | ## General Operation 24 | 25 | We will assume that `master` is the branch of `cfg-app.git` that Argo CD observes, 26 | and `auto` is the arbitrary branch that cfgupdater uses for its automatic commits. 27 | 28 | cfgupdater creates commits on the auto branch and attempts to keep auto rebased on master. 29 | Once CI reports a commit on the auto branch has passed, 30 | cfgupdater can fast-forward the master branch to that commit. 31 | 32 | The automatic commits have [trailers](https://git-scm.com/docs/git-interpret-trailers) 33 | which are easily machine-parsed for fuller audit details. 34 | 35 | ## Implementation Details 36 | 37 | The code for our cfgupdater implementation is not ready to share, 38 | but I can share the fine details on exactly what git operations we take at each stage of the process. 39 | 40 | The details are outlined as rough, untested shell scripts. 41 | For any functions that start with `helper_`, see the Helpers section at the bottom of this document. 42 | 43 | Assume that all scripts run in the root directory of the `cfg-app.git` working tree, 44 | and that all scripts are run effectively as `set -euo pipefail`. 
45 | 46 | ### Startup 47 | 48 | The process clones the full `cfg-app.git` repository. 49 | If the auto branch does not exist, it is created at the current commit at HEAD of master. 50 | 51 | ### New Images Published 52 | 53 | When `app`'s CI pipeline finishes, it publishes a JSON object to cfgupdater that looks like: 54 | 55 | ```json 56 | { 57 | "Service1": { 58 | "Digest": "docker.example.com/service1@sha256:6a9ca693f6fff83215c00b653bcf2106124705ad538dc509373523fdd6cefdb4", 59 | "Tag": "docker.example.com/service1:7d1043473d55bfa90e8530d35801d4e381bc69f0" 60 | }, 61 | "Service2": { 62 | "Digest": "docker.example.com/service2@sha256:621f0ce9f70ad34dcc76d4b28c0e16ff30afa7f0318ec9ed85f9979255006a65", 63 | "Tag": "docker.example.com/service2:7d1043473d55bfa90e8530d35801d4e381bc69f0" 64 | } 65 | } 66 | ``` 67 | 68 | Assumptions: 69 | - cfgupdater saves that JSON input as a file `/tmp/images.json`. 70 | - The full application SHA is available as `$APP_SHA`. 71 | 72 | Then cfgupdater effectively runs: 73 | 74 | ```sh 75 | helper_align_auto || helper_refresh 76 | 77 | # Detach from the auto branch so that we don't have persist changes back to the branch 78 | # until everything is done. 79 | git checkout --detach auto || helper_refresh 80 | 81 | # Apply the new images to the staging environment, and regenerate the YAML. 82 | make introduce_images IMAGE_FILE=/tmp/images.json 83 | 84 | # Format the commit message, and use commit -a to commit all changes to updated files. 85 | git commit -a -m "chore: update app to ${APP_SHA:0:10} 86 | 87 | Autocommit-App-SHA: $APP_SHA 88 | Autocommit-Target: staging 89 | Autocommit-Reason: new images published 90 | " 91 | 92 | # Set the branch back now that the commit is finalized. 
93 | git checkout -B auto HEAD 94 | 95 | git push origin auto:auto 96 | ``` 97 | 98 | If you are concerned that your CI pipeline may replay a build, 99 | this stage could see if the image has been published before by inspecting the output of 100 | `git log --format='%(trailers:key=Autocommit-App-SHA,valueonly)' auto`. 101 | However, note that this format style [requires git 2.22](https://github.com/git/git/blob/7a6a90c6ec48fc78c83d7090d6c1b95d8f3739c0/Documentation/RelNotes/2.22.0.txt#L21-L23) or newer, 102 | and as of this writing, [the newest git you can apt-get install on Debian Buster or Stretch, even with backports, is git 2.20](https://unix.stackexchange.com/q/559437). 103 | 104 | ### CI Status Reported 105 | 106 | We subscribe to GitHub webhook events for the `cfg-app.git` repository. 107 | 108 | When GitHub reports the updated status for a commit, it includes the names of the branches that include the commit. 109 | Pending statuses are ignored. 110 | We only care about a particular set of status checks; 111 | if they all pass, we can fast-forward master to that commit, 112 | or if any fail, we "evict" that commit from the auto branch. 113 | 114 | In these flows we rely heavily on `git merge-base --is-ancestor` to check that commits are ordered as expected, 115 | so you may want to read [its documentation](https://git-scm.com/docs/git-merge-base) 116 | if you are unfamiliar with the command. 117 | 118 | #### All Status Checks Passed 119 | 120 | When all internally required status checks pass, we fast-forward master to that new commit. 121 | Note that since GitHub reports a single status at a time, you may need to make a separate API call to GitHub to check whether the other required status checks have passed. 122 | 123 | Assumptions: 124 | - The commit whose status passed is available as `$GREEN_SHA`. 125 | 126 | ```sh 127 | helper_align_auto || helper_refresh 128 | 129 | # The commit must be on the auto branch. 
130 | git merge-base --is-ancestor $GREEN_SHA auto || helper_refresh 131 | 132 | # And ensure master will be fast-forwarded the new commit. 133 | # Note that if master is already on this commit, the command will still succeed. 134 | # This is fine, as the later merge attempt and push will be a no-op. 135 | git merge-base --is-ancestor master $GREEN_SHA || helper_refresh 136 | 137 | git checkout master 138 | git merge --ff-only $GREEN_SHA 139 | git push origin master:master || helper_refresh 140 | ``` 141 | 142 | #### Any Status Check Failed 143 | 144 | We need to rebase away the commit whose status failed. 145 | 146 | Assumptions: 147 | - The commit whose status failed is available as `$RED_SHA`. 148 | 149 | ```sh 150 | helper_align_auto || helper_refresh 151 | 152 | # $RED_SHA must be an ancestor of auto, and master must be an ancestor of $RED_SHA. 153 | git merge-base --is-ancestor $RED_SHA auto || helper_refresh 154 | git merge-base --is-ancestor master $RED_SHA || helper_refresh 155 | 156 | # git merge-base --is-ancestor x y will exit 0 if x and y point at the same commit; 157 | # so as one last sanity check, make sure master's commit isn't the same as $RED_SHA. 158 | test "$(git rev-parse --verify master)" != "$RED_SHA" || helper_refresh 159 | 160 | ORIG_AUTO_SHA="$(git rev-parse auto)" 161 | 162 | # Rebase away the actual commit. 163 | git rebase --onto "${RED_SHA}^" "$RED_SHA" auto || helper_refresh 164 | 165 | # Force-push with lease our new auto ref. 166 | git push --force-with-lease=auto:"$ORIG_AUTO_SHA" origin auto:auto || helper_refresh 167 | ``` 168 | 169 | ### Helpers 170 | 171 | Here are the details on the helpers referenced in the above implementation details. 172 | 173 | #### `helper_align_auto` 174 | 175 | The rebase at the end of `helper_align_auto` is likely the most brittle part of this git automation. 
176 | 177 | One potentially more intelligent solution would inspect the trailers on the commits in the auto branch, 178 | and then "replay" those actions as new commits on master. 179 | 180 | For now, we are using the rebase strategy, but we are specifically regenerating YAML, 181 | as detailed in the README, rather than allowing the possibility of merge conflicts in those generated files. 182 | Before rebasing, we run the appropriate `git config` commands to configure the custom merge driver. 183 | 184 | ```sh 185 | MASTER_SHA="$(git rev-parse master)" 186 | AUTO_SHA="$(git rev-parse auto)" 187 | 188 | if [ "$MASTER_SHA" == "$AUTO_SHA" ]; then 189 | # Branches are aligned. Nothing to do. 190 | exit 0 191 | fi 192 | 193 | if git merge-base --is-ancestor "$AUTO_SHA" "$MASTER_SHA"; then 194 | # auto is an ancestor of master. Maybe someone pushed directly to master. 195 | # Locally reset the auto branch to match master. 196 | git branch -f auto master 197 | exit 0 198 | fi 199 | 200 | if git merge-base --is-ancestor "$MASTER_SHA" "$AUTO_SHA"; then 201 | # master is an ancestor of auto. That just means auto has advanced past master. This is fine. 202 | exit 0 203 | fi 204 | 205 | # At this point, master is not an ancestor of auto, nor vice versa. 206 | # Try to rebase auto on master. 207 | MERGE_BASE="$(git merge-base "$MASTER_SHA" "$AUTO_SHA")" 208 | git rebase "$MASTER_SHA" auto 209 | git push --force-with-lease=auto:"$AUTO_SHA" origin auto:auto 210 | ``` 211 | 212 | #### `helper_refresh` 213 | 214 | This helper is run when a git command has failed, and it optimistically retries the entire script 215 | after creating a fresh clone of the application git repository. 216 | 217 | During the second run of the script, calls to `helper_refresh` have no effect. 218 | An earlier error is simply returned. 
219 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CD/GitOps Reference Architecture 2 | 3 | This is a logical overview of the CD/GitOps architecture in use at InfluxData, 4 | to deploy our Cloud offering on many regions in many cloud providers. 5 | 6 | We refer to this as CD/GitOps because this is a fusion of Continuous Deployment -- 7 | as in, every commit to master is delivered to production automatically -- 8 | with GitOps as popularized by Weaveworks, where a git repository is the source of truth for what is running in production. 9 | 10 | If this topic interests you and you're interested in expanding and applying these ideas: 11 | [InfluxData is hiring](https://grnh.se/ygda3s)! 12 | 13 | ## Overview 14 | 15 | There are two git repositories of interest, dubbed `app.git` and `cfg-app.git` (pronounced "config app"). 16 | 17 | `app.git` is your actual application code. 18 | In our engineering team, this is a monorepo, but few details should change if you have multiple repos. 19 | 20 | `cfg-app.git` is the repository that contains your Kubernetes configurations. 21 | Our repository is primarily Jsonnet, and we commit the generated YAML, 22 | so that we can confidently write and review Jsonnet changes without being surprised by the effects. 23 | Few details should change if you choose to use Helm, Kustomize, or comparable tools. 24 | 25 | There is a separate service, which we will call `cfgupdater` (pronounced "config updater"), 26 | that is responsible for creating automatic commits into `cfg-app`. 27 | We do not have an open source implementation of this `cfgupdater` yet, 28 | but it is described in considerable detail in [cfgupdater.md](/cfgupdater.md). 29 | 30 | We are using Argo CD to apply changes from the `cfg-app` repo into the target environments. 
31 | We expect that few details would change if you were using Weaveworks Flux or the future Argo Flux product. 32 | 33 | For a graphical overview of how the pieces interact, refer to [flowchart.txt](/flowchart.txt). 34 | 35 | ## Overall Patterns 36 | 37 | We have three Waves of deployment targets, dubbed `Staging`, `Internal`, and `Production`. 38 | Each wave may contain many targets (a target Namespace or collection of Namespaces, 39 | in a particular Kubernetes cluster, in a particular cloud provider). 40 | 41 | After a set of images is **successfully** deployed to the Staging targets, 42 | those images are promoted to the Internal environment; 43 | and after those images are all successfully deployed, the images are again promoted to Production. 44 | 45 | These three waves are what we believe suits our circumstances, 46 | but the pattern could be applied to any reasonable number of waves. 47 | 48 | ## Application Repository Patterns 49 | 50 | ### Docker Images 51 | 52 | We decided that we want to build and tag Docker images for every new push to master of our application. 53 | However, we do not want to needlessly deploy services that did not have a material change. 54 | In other words, a README update should not cause a new Docker image to be built, 55 | and a modification to a common library should only result in new Docker images for services that depend on the library. 56 | 57 | We achieved this with a two-pronged approach: 58 | reproducible builds of our services, and aggressive Docker caching. 59 | 60 | #### Reproducible Builds 61 | 62 | Our application monorepo happens to be written in Go, which makes it easy to achieve reproducible builds. 63 | 64 | In general, given the same source code at the same path, Go will produce the same binary, bit-for-bit. 65 | But there are a couple details to be aware of: 66 | 67 | - Go embeds a "build ID" that differs per host. Fix it to be the empty string with `GOFLAGS='-ldflags=-buildid='`. 
68 | - In Go 1.13 and newer, you can use `-trimpath` so that the source directory where you're building isn't included in the debug info. 69 | If you are using an older version of Go, just be sure that the source code is in the same absolute path on any machine building the source code. 70 | - If you are building in module mode, a module update that doesn't result in a material change can still affect the build. 71 | That is, if you upgrade module `foo` from v1.0 to v1.1, and foo/bar changes even though you don't reference that package, 72 | the debug information will differ between the two builds because of referencing `"foo@v1.0"` in one build and `"foo@v1.1"` in the next. 73 | You can avoid this problem if you build from the vendor directory, by not using modules at all, or by using `go mod vendor` and building with `-mod=vendor`. 74 | 75 | #### Aggressive Docker Caching 76 | 77 | There are likely several other valid approaches to achieve our goal 78 | of building a Docker image on every commit to the master branch of `app`, 79 | with the image digest only changing when the binary content has changed. 80 | Here's how we are solving the problem. 81 | 82 | We first build a single Docker image that contains all the binaries we will be shipping with our services. 83 | (See [`docker/Dockerfile.base`](/docker/Dockerfile.base), which refers to [`scripts/build-cd-base.bash`](/scripts/build-cd-base.bash).) 84 | Our real application builds over a dozen Go binaries, 85 | so we want to build them together to take advantage of the Go build cache. 86 | We are experimenting with Buildkit so that we can use a 87 | [cache mount](https://github.com/moby/buildkit/blob/b939973129b3d1795988e685f07a50a2afe8a401/frontend/dockerfile/docs/experimental.md#run---mounttypecache) 88 | and further speed up builds. 89 | 90 | Then, our applications' Dockerfiles use `COPY --from` to copy the binaries from the base image. 
91 | We provide the base image as a build argument, 92 | so that the application's Dockerfile isn't tightly coupled to that base image. 93 | Assuming the base image produces reproducible builds, then `COPY --from` will copy the same file 94 | and produce the same Docker image -- if and only if the previous image is available on the machine building the newer image. 95 | 96 | If you have a Dockerfile that produces the same effective layers, 97 | but you build the image on two different hosts without a common cache, 98 | you will produce two different Docker images because of timestamps and other metadata in newly created layers. 99 | To avoid this, you can tell Docker build to use a specific image as a cache source, 100 | like `docker build --cache-from=docker.example.com/service:$PREV_IMAGE`. 101 | But, if you are building on an ephemeral host, 102 | you have to explicitly pull that Docker image to ensure that image is used as a cache. 103 | 104 | In our setup, we tag the Docker images both with the full SHA of the app commit and with the source branch. 105 | We considered using an abbreviated SHA, but decided on the full SHA because it is completely unambiguous. 106 | 107 | See [`scripts/cd-base.bash`](/scripts/cd-base.bash) for our shell script that we run on CI to build the base images, 108 | and [`scripts/cd-service.bash`](/scripts/cd-service.bash) for the similar shell script that we use to build the service-specific images. 109 | The main difference in the scripts is that the service script stores an artifact containing the generated image tag and digest. 110 | More on that in the section on Config Repository Patterns 111 | 112 | ## Config Repository Patterns 113 | 114 | ### Jsonnet 115 | 116 | Jsonnet was a good fit for us, starting from scratch. 117 | If you are currently using Helm or Kustomize or any other tool and you're happy with it, by all means keep using it. 
118 | 119 | Rather than discussing Jsonnet in detail here, I will link to two references on real world use of it: 120 | 121 | - [Declarative Infrastructure with the Jsonnet Templating Language](https://databricks.com/blog/2017/06/26/declarative-infrastructure-jsonnet-templating-language.html) 122 | - [Google SRE Workbook, Configuration Specifics](https://landing.google.com/sre/workbook/chapters/configuration-specifics/) 123 | 124 | ### Commit the Generated YAML 125 | 126 | This is not strictly necessary, but we've decided to opt in to this pattern. 127 | 128 | It is important that we not only commit the YAML, but that we confirm in CI 129 | that the committed YAML is up to date. 130 | By doing so, we can refactor and review changes to Jsonnet with full confidence in their effect on YAML. 131 | 132 | Note that when Argo CD observes a directory, it will parse any Jsonnet 133 | and it will interpret straight Kubernetes resources in YAML. 134 | When we generate our YAML, we generate it into its own directory, 135 | to avoid Argo CD giving warnings about duplicate resource definitions. 136 | 137 | #### Regenerate YAML rather than risking merge conflicts 138 | 139 | Most of the time, you're writing config changes against master, so there is little risk of merge conflict. 140 | But every once in a while, you may have an old branch that needs to be rebased. 141 | If you are automatically rebasing commits, such as the strategy mentioned in the cfgupdater document, 142 | there will not be a human operator around to handle any merge conflicts. 143 | 144 | Luckily, it's easy to instruct git to use [a custom merge driver](https://www.git-scm.com/docs/gitattributes#_defining_a_custom_merge_driver). 
145 | One simple approach looks like: 146 | 147 | ```sh 148 | git config --local merge.regenerateyaml.name 'Regenerate YAML' 149 | git config --local merge.regenerateyaml.driver 'make regenerate-single-yaml REGENERATE_YAML=%P GIT_MERGE_OUT=%A' 150 | ``` 151 | 152 | The `%P` argument is the path in the working tree of the file that had a conflict. 153 | You may overwrite that file, but git also expects you to write the "merged" result to the `%A` argument. 154 | If you don't do that, the current version of git gives a strange error like `error: add_cacheinfo failed to refresh for path`. 155 | 156 | Finally, you must set up a .gitattributes entry like: 157 | 158 | ``` 159 | /generated/*/*.yml merge=regenerateyaml 160 | ``` 161 | 162 | This tells git to use the custom merge driver you configured earlier, when handling merges on files that match that pattern. 163 | 164 | ### Accessible Entrypoints to Config Operations 165 | 166 | The primary config operation we have is to regenerate the YAML after a manual Jsonnet change 167 | or after an image definition file is updated. 168 | This operation will frequently be run by humans, 169 | but machines will tend to regenerate the YAML indirectly by way of the secondary set of operations. 170 | 171 | The secondary set of operations we have is image promotion between environments -- 172 | in our case, introducing new images to Staging, promoting images from Staging to Internal, 173 | and promoting images from Internal to Production. 174 | These operations will rarely be run by humans, frequently by machine. 175 | 176 | We have Makefile targets for these operations, which call into shell scripts. 177 | This way, our `cfgupdater` application can be aware of just the make targets. 178 | If we ever need to refactor to something other than a shell script, 179 | the Makefile offers a layer of abstraction from those details. 
180 | 181 | ### Machine-Updatable Image Definitions 182 | 183 | As mentioned in the Accessible Entrypoints section, 184 | our tooling needs to be able to introduce new images into the config repository. 185 | 186 | Every set of images that may be updated at once is defined in its own JSON object. 187 | When we want to update that set of images, we overwrite the entire file with new values. 188 | Then our Jsonnet imports the JSON file and exposes the specific images where they are needed in our configuration objects. 189 | 190 | Because the images are delivered as a single unit, image promotion becomes a simple operation: 191 | 192 | ```sh 193 | # Example of promoting images from Acceptance to Internal. 194 | # Assumes the SHA of cfg-app.git that was successfully deployed to Acceptance is given as $DEPLOYED_SHA. 195 | git show "$DEPLOYED_SHA":images/acceptance/tags.json > images/internal/tags.json 196 | git show "$DEPLOYED_SHA":images/acceptance/digests.json > images/internal/digests.json 197 | git commit -m 'Promoted deployed images to Internal...' 198 | git push origin master 199 | ``` 200 | 201 | ### Record Both the Docker Image Tag and its Digest 202 | 203 | We intend to update the config repository for every commit to master of the application repository. 204 | But we have many services that may be updated; 205 | recording the digest of the image means that we can see, in the git diff, what services are expected to be affected by any image change. 206 | Recording the image tag, which maps to the commit SHA in the application repository, 207 | quickly indicates what source commit is currently deployed. 208 | 209 | We use the image digests in the pod specs because we know them at the time of image build and push. 210 | While a tag can be accidentally or maliciously modified, an image digest is immutable. 211 | 212 | ## cfgupdater 213 | 214 | The cfgupdater service is primarily responsible for creating automatic commits into the cfg-app.git repository. 
215 | 216 | It is implemented as an HTTP API to a [GitHub App](https://developer.github.com/apps/) 217 | that creates and pushes commits to cfg-app.git, and observes the CI status of those commits before merging the commits to master. 218 | 219 | Please refer to [flowchart.txt](/flowchart.txt) for an overview of how cfgupdater ties into the overall workflow. 220 | --------------------------------------------------------------------------------