├── docker ├── Dockerfile.app └── Dockerfile.base ├── scripts ├── cd-base.bash ├── cd-service.bash └── build-cd-base.bash ├── flowchart.txt ├── cfgupdater.md └── README.md /docker/Dockerfile.app: -------------------------------------------------------------------------------- 1 | # ARGs for the FROM target must occur first. 2 | # Allowing a dynamic source image allows developers to quickly build a single image, 3 | # rather than building the full Dockerfile.cd that compiles many commands. 4 | ARG BIN_IMAGE 5 | FROM ${BIN_IMAGE} as bin_image 6 | 7 | FROM alpine:3 8 | 9 | # Add certificates first, as that may be a common layer with other alpine-based images. 10 | RUN apk add --no-cache ca-certificates 11 | 12 | # The cmd will not change, so set it early for better layer caching. 13 | CMD ["/foo-svc"] 14 | 15 | # Again, having a dynamic source directory allows simplified local development. 16 | ARG BIN_SRC_DIR=/app-bins 17 | COPY --from=bin_image ${BIN_SRC_DIR}/foo-svc /foo-svc 18 | -------------------------------------------------------------------------------- /scripts/cd-base.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -eu -o pipefail 4 | 5 | # get current branch and commit sha, and force branch name to valid docker tag character set 6 | # allow forcing branch name regardless of current branch using FORCE_CI_* variables 7 | readonly branch_name="${FORCE_CI_BRANCH_NAME:-$(git rev-parse --abbrev-ref HEAD | tr -d '\n' | tr -c '[:alnum:].-' _)}" 8 | readonly git_sha="${FORCE_CI_GIT_SHA:-$(git rev-parse HEAD)}" 9 | 10 | # pull image if it exists, ignore if it does not 11 | docker pull "docker.example.com/app-bins:${branch_name}" || true 12 | 13 | docker build --pull \ 14 | --cache-from "docker.example.com/app-bins:${branch_name}" \ 15 | -t "docker.example.com/app-bins:${git_sha}" \ 16 | -f Dockerfile.cd . 
17 | 18 | docker tag \ 19 | "docker.example.com/app-bins:${git_sha}" \ 20 | "docker.example.com/app-bins:${branch_name}" 21 | 22 | docker push "docker.example.com/app-bins:${git_sha}" 23 | docker push "docker.example.com/app-bins:${branch_name}" 24 | -------------------------------------------------------------------------------- /docker/Dockerfile.base: -------------------------------------------------------------------------------- 1 | # This Dockerfile produces a single Docker image that holds a whitelisted set of built binaries, 2 | # as defined in scripts/build-cd-base.bash. 3 | # Downstream builds will copy only the binaries they need from this base layer. 4 | 5 | FROM docker.io/golang:1.12-buster as builder 6 | 7 | # Workdir isn't going to change, so set it first. 8 | # We don't need to be in a fake GOPATH anymore. 9 | WORKDIR /app 10 | 11 | # Copy over the shell scripts we use inside other downstream containers. 12 | # These scripts are less likely to change than the binaries we compile. 13 | COPY \ 14 | ./service/foo/do_something.sh \ 15 | ./service/bar/do_something_else.sh \ 16 | /app-bins/ 17 | 18 | # Assuming the vendor directory is up-to-date with `go mod vendor`. 19 | COPY . /app 20 | 21 | # Build the whitelisted binaries. 22 | RUN APP_BIN_DEST=/app-bins CLEAN_GOCACHE=1 /app/scripts/build-cd-base.bash 23 | 24 | # Our final image is only going to hold the binaries and any shell scripts needed for other images. 
25 | FROM scratch 26 | COPY --from=builder /app-bins/ /app-bins/ 27 | -------------------------------------------------------------------------------- /scripts/cd-service.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -eu -o pipefail 4 | 5 | # service to build; strip leading "cd-service-" to avoid passing CI JOB name directly from config.yml 6 | readonly service="$(echo "${1:?App required}" | sed 's,^cd-service-,,')" 7 | 8 | # get current branch and commit sha, and force branch name to valid docker tag character set 9 | # allow forcing branch name regardless of current branch using FORCE_CI_* variables 10 | readonly branch_name="${FORCE_CI_BRANCH_NAME:-$(git rev-parse --abbrev-ref HEAD | tr -d '\n' | tr -c '[:alnum:].-' _)}" 11 | readonly git_sha="${FORCE_CI_GIT_SHA:-$(git rev-parse HEAD)}" 12 | 13 | # pull image if it exists, ignore if it does not 14 | docker pull "docker.example.com/app-cd-${service}:${branch_name}" || true 15 | 16 | docker build --pull \ 17 | --cache-from "docker.example.com/app-cd-${service}:${branch_name}" \ 18 | -t "docker.example.com/app-cd-${service}:${git_sha}" \ 19 | --build-arg BIN_IMAGE=docker.example.com/app-bins:${git_sha} \ 20 | -f "apps/${service}/Dockerfile.cd" . 21 | 22 | docker tag \ 23 | "docker.example.com/app-cd-${service}:${git_sha}" \ 24 | "docker.example.com/app-cd-${service}:${branch_name}" 25 | 26 | docker push "docker.example.com/app-cd-${service}:${git_sha}" 27 | docker push "docker.example.com/app-cd-${service}:${branch_name}" 28 | 29 | # Construct a JSON object for this image's Tag and Digest. 30 | # I was not able to retrieve the digest when calling docker images against a specific tag, 31 | # so get all the tags and digests for the given image and grep for the matching tag. 
32 | jq --null-input --sort-keys \ 33 | --arg tagDigest "$(docker images "docker.example.com/app-cd-${service}" --format '{{.Tag}} {{.Digest}}' | grep "^${git_sha}")" \ 34 | --arg imgPrefix "docker.example.com/app-cd-$service" \ 35 | --arg serviceKey "$SERVICEKEY" \ 36 | '$tagDigest | split(" ") as $td | { 37 | ($serviceKey): { 38 | Tag: ($imgPrefix + ":" + $td[0]), 39 | Digest: ($imgPrefix + "@" + $td[1]), 40 | } 41 | }' | tee "/artifacts/$SERVICEKEY.json" 42 | -------------------------------------------------------------------------------- /scripts/build-cd-base.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script is called from Dockerfile.cd to build all of the necessary commands in one base image, 4 | # so that we amortize compile costs. 5 | # Then, downstream images can simply copy their binary from this base image. 6 | 7 | set -eux 8 | 9 | if [ -z "${APP_BIN_DEST:-}" ]; then 10 | # Requiring the output as an explicit argument, 11 | # so the script can be run in container or on workstation. (The :- default expansion keeps set -u from aborting before this message prints.) 12 | >&2 echo '$APP_BIN_DEST must be set as destination directory for built binaries.' 13 | exit 1 14 | fi 15 | 16 | # Sanity checks. 17 | if [ ! -f go.mod ]; then 18 | >&2 echo 'This script must be run from the root of the app repository.' 19 | exit 1 20 | fi 21 | modheader="$(head -n 1 go.mod)" 22 | if [ "$modheader" != 'module example.com/app' ]; then 23 | >&2 echo 'go.mod detected, but it does not appear to be the app module.' 24 | exit 1 25 | fi 26 | 27 | # All of the commands we are going to build, as a bash array. 28 | # There are some that we will never build, so we whitelist required builds instead of using ./cmd/... . 29 | cmds=( 30 | foo 31 | bar 32 | baz 33 | ) 34 | 35 | # Common go flags to the upcoming go build commands. 36 | export CGO_ENABLED=0 GOOS=linux GO111MODULE=on 37 | 38 | # GOFLAGS is special. 39 | # We always want -mod=vendor in this script. 
40 | # We want to hardcode an empty buildid so that builds are bit-for-bit identical. 41 | # See https://github.com/golang/go/issues/33772. 42 | # After upgrading to Go 1.13, it will probably make sense to add -trimpath. 43 | export GOFLAGS='-mod=vendor -ldflags=-buildid=' 44 | 45 | # Helpful log output to see the exact version of Go used. 46 | go version 47 | 48 | # Build ./cmd/foo for foo in $cmds (https://stackoverflow.com/a/12744170). 49 | # And build it as one command, to maximize concurrency. 50 | # When we switch to Go 1.13, we can use "-o $dir" to emit all the executables, 51 | # but until then, we build once to warm the cache and then emit the individual binaries. 52 | # Run with time just for simple insight on how long this takes. 53 | time go build "${cmds[@]/#/./cmd/}" 54 | 55 | # Again, this can be refactored away after we move to Go 1.13. 56 | for cmd in "${cmds[@]}"; do 57 | time go build -o "$APP_BIN_DEST/$cmd" "./cmd/$cmd" 58 | done 59 | 60 | # If you run this script on your workstation, you probably don't want to blow away your go cache. 61 | # But if you run it in a Docker container, the cache just wastes disk space / image size. 62 | # (Variable expansion with defaulting, to avoid an unset variable error.) 
63 | if [ -n "${CLEAN_GOCACHE:-}" ]; then 64 | go clean -cache 65 | fi 66 | -------------------------------------------------------------------------------- /flowchart.txt: -------------------------------------------------------------------------------- 1 | ┌───────────────────────────┐ 2 | │ │ 3 | │ app.git │ 4 | │ │ 5 | └───────────────────────────┘ 6 | │ 7 | │ 8 | Master 9 | branch is 10 | updated 11 | │ 12 | ▼ 13 | ┌────────────────────────────┐ 14 | │ CI pipeline builds Docker │ 15 | │images for every service in │ 16 | │ the application │ 17 | └────────────────────────────┘ 18 | │ 19 | CI posts new image tags and 20 | digests to cfgupdater 21 | │ 22 | │ 23 | ▼ 24 | ┌────────────────────────────────────┐ 25 | │ cfgupdater creates a new commit in │ 26 | │ cfg-app to update Staging images │ 27 | │ to the most recent app commit │ 28 | │ │ 29 | └────────────────────────────────────┘ ┌─────────────────────┐ 30 | │ │ Developer directly │ 31 | │ │ modifies Jsonnet to │ 32 | Effectively, git ┌──────────│ update Kubernetes │ 33 | push │ │ resource(s) │ 34 | │ │ └─────────────────────┘ 35 | ▼ ▼ 36 | ┌───────────────────────────────┐ 37 | │ cfg-app.git │ 38 | │ ├─────────────────┐ 39 | ┌──────▶│(Jsonnet to generate k8s YAML) │ │ 40 | │ ┌────▶│ ├───────┐ │ 41 | │ │ └───────────────────────────────┘ │ │ 42 | │ │ │ │ │ 43 | │ │ │ │ │ 44 | │ │ Argo observes cfg-app.git │ │ 45 | │ │ update on Staging env │ │ 46 | │ │ config │ │ 47 | │ │ │ │ │ 48 | │ │ ▼ │ │ 49 | │ │ ┌──────────────────────────────┐ │ │ 50 | │ │ │ Argo CD deploys to Staging │ │ │ 51 | │ │ │ envs │ │ │ 52 | │ │ │ │ Argo observes │ 53 | │ │ └──────────────────────────────┘ cfg-app.git │ 54 | │ │ │ update on │ 55 | │ │ │ Internal env │ 56 | │ │ Argo CD post-deploy hook config │ 57 | │ │ indicating successful deploy │ │ 58 | │ │ │ │ │ 59 | │ │ │ │ │ 60 | │ │ ▼ │ │ 61 | │ │ ┌─────────────────────────────────────┐ │ │ 62 | │ │ │ │ │ │ 63 | │ │ │ cfgupdater creates a new commit in │ │ │ 64 | │ │ │ cfg-app, promoting 
the images │ │ │ 65 | │ └──│ successfully deployed to Staging, │ │ Argo observes 66 | │ │ into Internal environments │ │ cfg-app.git 67 | │ │ │ │ update on 68 | │ └─────────────────────────────────────┘ │ Production env 69 | │ │ config 70 | │ │ │ 71 | │ │ │ 72 | │ ┌──────────────────────────────┐ │ │ 73 | │ │ Argo CD deploys to Internal │ │ │ 74 | │ │ envs │◀──────┘ │ 75 | │ └──────────────────────────────┘ │ 76 | │ │ │ 77 | │ Argo CD post-deploy │ 78 | │ hook indicating │ 79 | │ successful deploy │ 80 | │ │ │ 81 | │ │ │ 82 | │ ▼ │ 83 | │ ┌─────────────────────────────────────┐ │ 84 | │ │ │ │ 85 | │ │ cfgupdater creates a new commit in │ │ 86 | │ │ cfg-app, promoting the images │ │ 87 | └─────│ successfully deployed to Internal, │ │ 88 | │ into Production environments │ │ 89 | │ │ │ 90 | └─────────────────────────────────────┘ │ 91 | │ 92 | │ 93 | ┌──────────────────────────────┐ │ 94 | │Argo CD deploys to Production │ │ 95 | │ environments │◀───────────────┘ 96 | │ │ 97 | └──────────────────────────────┘ 98 | 99 | Created with Monodraw 100 | -------------------------------------------------------------------------------- /cfgupdater.md: -------------------------------------------------------------------------------- 1 | # cfgupdater 2 | 3 | This document describes the cfgupdater service in detail. 4 | Refer to [README.md](/README.md) for an overview of the overall patterns InfluxData uses for CD/GitOps. 5 | 6 | ## Overview 7 | 8 | cfgupdater is implemented as a [GitHub App](https://developer.github.com/apps/). 9 | The overall patterns would generally be the same if you were writing a service to interact with another source control provider. 
10 | 11 | The API endpoints look like: 12 | 13 | - One endpoint to be triggered from `app`'s CI pipeline to announce a new set of images 14 | - One set of endpoints to be triggered from Argo CD post sync hooks, to handle promoting images between environment sets 15 | - One endpoint to receive GitHub webhooks for commit status updates on the cfg-app repository. 16 | 17 | You will notice that the first two endpoints map to the "accessible entrypoints" of the config repository. 18 | The third endpoint is so that we follow the 19 | ["Not Rocket Science Rule of Software Engineering"](https://graydon.livejournal.com/186550.html) 20 | by pushing automatic commits to a branch that is rebased on master, 21 | and master is always fast-forwarded to a commit that has already passed CI. 22 | 23 | ## General Operation 24 | 25 | We will assume that `master` is the branch of `cfg-app.git` that Argo CD observes, 26 | and `auto` is the arbitrary branch that cfgupdater uses for its automatic commits. 27 | 28 | cfgupdater creates commits on the auto branch and attempts to keep auto rebased on master. 29 | Once CI reports a commit on the auto branch has passed, 30 | cfgupdater can fast-forward the master branch to that commit. 31 | 32 | The automatic commits have [trailers](https://git-scm.com/docs/git-interpret-trailers) 33 | which are easily machine-parsed for fuller audit details. 34 | 35 | ## Implementation Details 36 | 37 | The code for our cfgupdater implementation is not ready to share, 38 | but I can share the fine details on exactly what git operations we take at each stage of the process. 39 | 40 | The details are outlined as rough, untested shell scripts. 41 | For any functions that start with `helper_`, see the Helpers section at the bottom of this document. 42 | 43 | Assume that all scripts run in the root directory of the `cfg-app.git` working tree, 44 | and that all scripts are run effectively as `set -euo pipefail`. 
45 | 46 | ### Startup 47 | 48 | The process clones the full `cfg-app.git` repository. 49 | If the auto branch does not exist, it is created at the current commit at HEAD of master. 50 | 51 | ### New Images Published 52 | 53 | When `app`'s CI pipeline finishes, it publishes a JSON object to cfgupdater that looks like: 54 | 55 | ```json 56 | { 57 | "Service1": { 58 | "Digest": "docker.example.com/service1@sha256:6a9ca693f6fff83215c00b653bcf2106124705ad538dc509373523fdd6cefdb4", 59 | "Tag": "docker.example.com/service1:7d1043473d55bfa90e8530d35801d4e381bc69f0" 60 | }, 61 | "Service2": { 62 | "Digest": "docker.example.com/service2@sha256:621f0ce9f70ad34dcc76d4b28c0e16ff30afa7f0318ec9ed85f9979255006a65", 63 | "Tag": "docker.example.com/service2:7d1043473d55bfa90e8530d35801d4e381bc69f0" 64 | } 65 | } 66 | ``` 67 | 68 | Assumptions: 69 | - cfgupdater saves that JSON input as a file `/tmp/images.json`. 70 | - The full application SHA is available as `$APP_SHA`. 71 | 72 | Then cfgupdater effectively runs: 73 | 74 | ```sh 75 | helper_align_auto || helper_refresh 76 | 77 | # Detach from the auto branch so that we don't have persist changes back to the branch 78 | # until everything is done. 79 | git checkout --detach auto || helper_refresh 80 | 81 | # Apply the new images to the staging environment, and regenerate the YAML. 82 | make introduce_images IMAGE_FILE=/tmp/images.json 83 | 84 | # Format the commit message, and use commit -a to commit all changes to updated files. 85 | git commit -a -m "chore: update app to ${APP_SHA:0:10} 86 | 87 | Autocommit-App-SHA: $APP_SHA 88 | Autocommit-Target: staging 89 | Autocommit-Reason: new images published 90 | " 91 | 92 | # Set the branch back now that the commit is finalized. 
93 | git checkout -B auto HEAD 94 | 95 | git push origin auto:auto 96 | ``` 97 | 98 | If you are concerned that your CI pipeline may replay a build, 99 | this stage could see if the image has been published before by inspecting the output of 100 | `git log --format='%(trailers:key=Autocommit-App-SHA,valueonly)' auto`. 101 | However, note that this format style [requires git 2.22](https://github.com/git/git/blob/7a6a90c6ec48fc78c83d7090d6c1b95d8f3739c0/Documentation/RelNotes/2.22.0.txt#L21-L23) or newer, 102 | and as of this writing, [the newest git you can apt-get install on Debian Buster or Stretch, even with backports, is git 2.20](https://unix.stackexchange.com/q/559437). 103 | 104 | ### CI Status Reported 105 | 106 | We subscribe to GitHub webhook events for the `cfg-app.git` repository. 107 | 108 | When GitHub reports the updated status for a commit, it includes the names of the branches that include the commit. 109 | Pending statuses are ignored. 110 | We only care about a particular set of status checks; 111 | if they all pass, we can fast-forward master to that commit, 112 | or if any fail, we "evict" that commit from the auto branch. 113 | 114 | In these flows we rely heavily on `git merge-base --is-ancestor` to check that commits are ordered as expected, 115 | so you may want to read [its documentation](https://git-scm.com/docs/git-merge-base) 116 | if you are unfamiliar with the command. 117 | 118 | #### All Status Checks Passed 119 | 120 | When all internally required status checks pass, we fast-forward master to that new commit. 121 | Note that since GitHub reports a single status at a time, you may need to make a separate API call to GitHub to check whether the other required status checks have passed. 122 | 123 | Assumptions: 124 | - The commit whose status passed is available as `$GREEN_SHA`. 125 | 126 | ```sh 127 | helper_align_auto || helper_refresh 128 | 129 | # The commit must be on the auto branch. 
130 | git merge-base --is-ancestor $GREEN_SHA auto || helper_refresh 131 | 132 | # And ensure master will be fast-forwarded the new commit. 133 | # Note that if master is already on this commit, the command will still succeed. 134 | # This is fine, as the later merge attempt and push will be a no-op. 135 | git merge-base --is-ancestor master $GREEN_SHA || helper_refresh 136 | 137 | git checkout master 138 | git merge --ff-only $GREEN_SHA 139 | git push origin master:master || helper_refresh 140 | ``` 141 | 142 | #### Any Status Check Failed 143 | 144 | We need to rebase away the commit whose status failed. 145 | 146 | Assumptions: 147 | - The commit whose status failed is available as `$RED_SHA`. 148 | 149 | ```sh 150 | helper_align_auto || helper_refresh 151 | 152 | # $RED_SHA must be an ancestor of auto, and master must be an ancestor of $RED_SHA. 153 | git merge-base --is-ancestor $RED_SHA auto || helper_refresh 154 | git merge-base --is-ancestor master $RED_SHA || helper_refresh 155 | 156 | # git merge-base --is-ancestor x y will exit 0 if x and y point at the same commit; 157 | # so as one last sanity check, make sure master's commit isn't the same as $RED_SHA. 158 | test "$(git rev-parse --verify master)" != "$RED_SHA" || helper_refresh 159 | 160 | ORIG_AUTO_SHA="$(git rev-parse auto)" 161 | 162 | # Rebase away the actual commit. 163 | git rebase --onto "${RED_SHA}^" "$RED_SHA" auto || helper_refresh 164 | 165 | # Force-push with lease our new auto ref. 166 | git push --force-with-lease=auto:"$ORIG_AUTO_SHA" origin auto:auto || helper_refresh 167 | ``` 168 | 169 | ### Helpers 170 | 171 | Here are the details on the helpers referenced in the above implementation details. 172 | 173 | #### `helper_align_auto` 174 | 175 | The rebase at the end of `helper_align_auto` is likely the most brittle part of this git automation. 
176 | 177 | One potentially more intelligent solution would inspect the trailers on the commits in the auto branch, 178 | and then "replay" those actions as new commits on master. 179 | 180 | For now, we are using the rebase strategy, but we are specifically regenerating YAML, 181 | as detailed in the README, rather than allowing the possibility of merge conflicts in those generated files. 182 | Before rebasing, we run the appropriate `git config` commands to configure the custom merge driver. 183 | 184 | ```sh 185 | MASTER_SHA="$(git rev-parse master)" 186 | AUTO_SHA="$(git rev-parse auto)" 187 | 188 | if [ "$MASTER_SHA" == "$AUTO_SHA" ]; then 189 | # Branches are aligned. Nothing to do. 190 | exit 0 191 | fi 192 | 193 | if git merge-base --is-ancestor "$AUTO_SHA" "$MASTER_SHA"; then 194 | # auto is an ancestor of master. Maybe someone pushed directly to master. 195 | # Locally reset the auto branch to match master. 196 | git branch -f auto master 197 | exit 0 198 | fi 199 | 200 | if git merge-base --is-ancestor "$MASTER_SHA" "$AUTO_SHA"; then 201 | # master is an ancestor of auto. That just means auto has advanced past master. This is fine. 202 | exit 0 203 | fi 204 | 205 | # At this point, master is not an ancestor of auto, nor vice versa. 206 | # Try to rebase auto on master. 207 | MERGE_BASE="$(git merge-base "$MASTER_SHA" "$AUTO_SHA")" 208 | git rebase "$MASTER_SHA" auto 209 | git push --force-with-lease=auto:"$AUTO_SHA" origin auto:auto 210 | ``` 211 | 212 | #### `helper_refresh` 213 | 214 | This helper is run when a git command has failed, and it optimistically retries the entire script 215 | after creating a fresh clone of the application git repository. 216 | 217 | During the second run of the script, calls to `helper_refresh` have no effect. 218 | An earlier error is simply returned. 
219 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CD/GitOps Reference Architecture 2 | 3 | This is a logical overview of the CD/GitOps architecture in use at InfluxData, 4 | to deploy our Cloud offering on many regions in many cloud providers. 5 | 6 | We refer to this as CD/GitOps because this is a fusion of Continuous Deployment -- 7 | as in, every commit to master is delivered to production automatically -- 8 | with GitOps as popularized by Weaveworks, where a git repository is the source of truth for what is running in production. 9 | 10 | If this topic interests you and you're interested in expanding and applying these ideas: 11 | [InfluxData is hiring](https://grnh.se/ygda3s)! 12 | 13 | ## Overview 14 | 15 | There are two git repositories of interest, dubbed `app.git` and `cfg-app.git` (pronounced "config app"). 16 | 17 | `app.git` is your actual application code. 18 | In our engineering team, this is a monorepo, but few details should change if you have multiple repos. 19 | 20 | `cfg-app.git` is the repository that contains your Kubernetes configurations. 21 | Our repository is primarily Jsonnet, and we commit the generated YAML, 22 | so that we can confidently write and review Jsonnet changes without being surprised by the effects. 23 | Few details should change if you choose to use Helm, Kustomize, or comparable tools. 24 | 25 | There is a separate service, which we will call `cfgupdater` (pronounced "config updater"), 26 | that is responsible for creating automatic commits into `cfg-app`. 27 | We do not have an open source implementation of this `cfgupdater` yet, 28 | but it is described in considerable detail in [cfgupdater.md](/cfgupdater.md). 29 | 30 | We are using Argo CD to apply changes from the `cfg-app` repo into the target environments. 
31 | We expect that few details would change if you were using Weaveworks Flux or the future Argo Flux product. 32 | 33 | For a graphical overview of how the pieces interact, refer to [flowchart.txt](/flowchart.txt). 34 | 35 | ## Overall Patterns 36 | 37 | We have three Waves of deployment targets, dubbed `Staging`, `Internal`, and `Production`. 38 | Each wave may contain many targets (a target Namespace or collection of Namespaces, 39 | in a particular Kubernetes cluster, in a particular cloud provider). 40 | 41 | After a set of images is **successfully** deployed to the Staging targets, 42 | those images are promoted to the Internal environment; 43 | and after those images are all successfully deployed, the images are again promoted to Production. 44 | 45 | These three waves are what we believe suits our circumstances, 46 | but the pattern could be applied to any reasonable number of waves. 47 | 48 | ## Application Repository Patterns 49 | 50 | ### Docker Images 51 | 52 | We decided that we want to build and tag Docker images for every new push to master of our application. 53 | However, we do not want to needlessly deploy services that did not have a material change. 54 | In other words, a README update should not cause a new Docker image to be built, 55 | and a modification to a common library should only result in new Docker images for services that depend on the library. 56 | 57 | We achieved this with a two-pronged approach: 58 | reproducible builds of our services, and aggressive Docker caching. 59 | 60 | #### Reproducible Builds 61 | 62 | Our application monorepo happens to be written in Go, which makes it easy to achieve reproducible builds. 63 | 64 | In general, given the same source code at the same path, Go will produce the same binary, bit-for-bit. 65 | But there are a couple details to be aware of: 66 | 67 | - Go embeds a "build ID" that differs per host. Fix it to be the empty string with `GOFLAGS='-ldflags=-buildid='`. 
68 | - In Go 1.13 and newer, you can use `-trimpath` so that the source directory where you're building isn't included in the debug info. 69 | If you are using an older version of Go, just be sure that the source code is in the same absolute path on any machine building the source code. 70 | - If you are building in module mode, a module update that doesn't result in a material change can still affect the build. 71 | That is, if you upgrade module `foo` from v1.0 to v1.1, and foo/bar changes even though you don't reference that package, 72 | the debug information will differ between the two builds because of referencing `"foo@v1.0"` in one build and `"foo@v1.1"` in the next. 73 | You can avoid this problem if you build from the vendor directory, by not using modules at all, or by using `go mod vendor` and building with `-mod=vendor`. 74 | 75 | #### Aggressive Docker Caching 76 | 77 | There are likely several other valid approaches to achieve our goal 78 | of building a Docker image on every commit to the master branch of `app`, 79 | with the image digest only changing when the binary content has changed. 80 | Here's how we are solving the problem. 81 | 82 | We first build a single Docker image that contains all the binaries we will be shipping with our services. 83 | (See [`docker/Dockerfile.base`](/docker/Dockerfile.base), which refers to [`scripts/build-cd-base.bash`](/scripts/build-cd-base.bash).) 84 | Our real application builds over a dozen Go binaries, 85 | so we want to build them together to take advantage of the Go build cache. 86 | We are experimenting with Buildkit so that we can use a 87 | [cache mount](https://github.com/moby/buildkit/blob/b939973129b3d1795988e685f07a50a2afe8a401/frontend/dockerfile/docs/experimental.md#run---mounttypecache) 88 | and further speed up builds. 89 | 90 | Then, our applications' Dockerfiles use `COPY --from` to copy the binaries from the base image. 
91 | We provide the base image as a build argument, 92 | so that the application's Dockerfile isn't tightly coupled to that base image. 93 | Assuming the base image produces reproducible builds, then `COPY --from` will copy the same file 94 | and produce the same Docker image -- if and only if the previous image is available on the machine building the newer image. 95 | 96 | If you have a Dockerfile that produces the same effective layers, 97 | but you build the image on two different hosts without a common cache, 98 | you will produce two different Docker images because of timestamps and other metadata in newly created layers. 99 | To avoid this, you can tell Docker build to use a specific image as a cache source, 100 | like `docker build --cache-from=docker.example.com/service:$PREV_IMAGE`. 101 | But, if you are building on an ephemeral host, 102 | you have to explicitly pull that Docker image to ensure that image is used as a cache. 103 | 104 | In our setup, we tag the Docker images both with the full SHA of the app commit and with the source branch. 105 | We considered using an abbreviated SHA, but decided on the full SHA because it is completely unambiguous. 106 | 107 | See [`scripts/cd-base.bash`](/scripts/cd-base.bash) for our shell script that we run on CI to build the base images, 108 | and [`scripts/cd-service.bash`](/scripts/cd-service.bash) for the similar shell script that we use to build the service-specific images. 109 | The main difference in the scripts is that the service script stores an artifact containing the generated image tag and digest. 110 | More on that in the section on Config Repository Patterns 111 | 112 | ## Config Repository Patterns 113 | 114 | ### Jsonnet 115 | 116 | Jsonnet was a good fit for us, starting from scratch. 117 | If you are currently using Helm or Kustomize or any other tool and you're happy with it, by all means keep using it. 
118 | 119 | Rather than discussing Jsonnet in detail here, I will link to two references on real world use of it: 120 | 121 | - [Declarative Infrastructure with the Jsonnet Templating Language](https://databricks.com/blog/2017/06/26/declarative-infrastructure-jsonnet-templating-language.html) 122 | - [Google SRE Workbook, Configuration Specifics](https://landing.google.com/sre/workbook/chapters/configuration-specifics/) 123 | 124 | ### Commit the Generated YAML 125 | 126 | This is not strictly necessary, but we've decided to opt in to this pattern. 127 | 128 | It is important that we not only commit the YAML, but that we confirm in CI 129 | that the committed YAML is up to date. 130 | By doing so, we can refactor and review changes to Jsonnet with full confidence in their effect on YAML. 131 | 132 | Note that when Argo CD observes a directory, it will parse any Jsonnet 133 | and it will interpret straight Kubernetes resources in YAML. 134 | When we generate our YAML, we generate it into its own directory, 135 | to avoid Argo CD giving warnings about duplicate resource definitions. 136 | 137 | #### Regenerate YAML rather than risking merge conflicts 138 | 139 | Most of the time, you're writing config changes against master, so there is little risk of merge conflict. 140 | But every once in a while, you may have an old branch that needs to be rebased. 141 | If you are automatically rebasing commits, such as the strategy mentioned in the cfgupdater document, 142 | there will not be a human operator around to handle any merge conflicts. 143 | 144 | Luckily, it's easy to instruct git to use [a custom merge driver](https://www.git-scm.com/docs/gitattributes#_defining_a_custom_merge_driver). 
145 | One simple approach looks like: 146 | 147 | ```sh 148 | git config --local merge.regenerateyaml.name 'Regenerate YAML' 149 | git config --local merge.regenerateyaml.driver 'make regenerate-single-yaml REGENERATE_YAML=%P GIT_MERGE_OUT=%A' 150 | ``` 151 | 152 | The `%P` argument is the path in the working tree of the file that had a conflict. 153 | You may overwrite that file, but git also expects you to write the "merged" result to the `%A` argument. 154 | If you don't do that, the current version of git gives a strange error like `error: add_cacheinfo failed to refresh for path`. 155 | 156 | Finally, you must set up a .gitattributes entry like: 157 | 158 | ``` 159 | /generated/*/*.yml merge=regenerateyaml 160 | ``` 161 | 162 | This tells git to use the custom merge driver you configured earlier, when handling merges on files that match that pattern. 163 | 164 | ### Accessible Entrypoints to Config Operations 165 | 166 | The primary config operation we have is to regenerate the YAML after a manual Jsonnet change 167 | or after an image definition file is updated. 168 | This operation will frequently be run by humans, 169 | but machines will tend to regenerate the YAML indirectly by way of the secondary set of operations. 170 | 171 | The secondary set of operations we have is image promotion between environments -- 172 | in our case, introducing new images to Staging, promoting images from Staging to Internal, 173 | and promoting images from Internal to Production. 174 | These operations will rarely be run by humans, frequently by machine. 175 | 176 | We have Makefile targets for these operations, which call into shell scripts. 177 | This way, our `cfgupdater` application can be aware of just the make targets. 178 | If we ever need to refactor to something other than a shell script, 179 | the Makefile offers a layer of abstraction from those details. 
180 | 181 | ### Machine-Updatable Image Definitions 182 | 183 | As mentioned in the Accessible Entrypoints section, 184 | our tooling needs to be able to introduce new images into the config repository. 185 | 186 | Every set of images that may be updated at once is defined in its own JSON object. 187 | When we want to update that set of images, we overwrite the entire file with new values. 188 | Then our Jsonnet imports the JSON file and exposes the specific images where they are needed in our configuration objects. 189 | 190 | Because the images are delivered as a single unit, image promotion becomes a simple operation: 191 | 192 | ```sh 193 | # Example of promoting images from Acceptance to Internal. 194 | # Assumes the SHA of cfg-app.git that was successfully deployed to Acceptance is given as $DEPLOYED_SHA. 195 | git show "$DEPLOYED_SHA":images/acceptance/tags.json > images/internal/tags.json 196 | git show "$DEPLOYED_SHA":images/acceptance/digests.json > images/internal/digests.json 197 | git commit -m 'Promoted deployed images to Internal...' 198 | git push origin master 199 | ``` 200 | 201 | ### Record Both the Docker Image Tag and its Digest 202 | 203 | We intend to update the config repository for every commit to master of the application repository. 204 | But we have many services that may be updated; 205 | recording the digest of the image means that we can see, in the git diff, what services are expected to be affected by any image change. 206 | Recording the image tag, which maps to the commit SHA in the application repository, 207 | quickly indicates what source commit is currently deployed. 208 | 209 | We use the image digests in the pod specs because we know them at the time of image build and push. 210 | While a tag can be accidentally or maliciously modified, an image digest is immutable. 211 | 212 | ## cfgupdater 213 | 214 | The cfgupdater service is primarily responsible for creating automatic commits into the cfg-app.git repository. 
215 | 216 | It is implemented as an HTTP API to a [GitHub App](https://developer.github.com/apps/) 217 | that creates and pushes commits to cfg-app.git, and observes the CI status of those commits before merging the commits to master. 218 | 219 | Please refer to [flowchart.txt](/flowchart.txt) for an overview of how cfgupdater ties into the overall workflow. 220 | --------------------------------------------------------------------------------