├── .gitignore
├── np-allow-metadata.yaml
├── np-deny-metadata.yaml
├── pod-security-policy.yaml
├── np-back-to-db.yaml
├── np-default-deny.yaml
├── network-policy.yaml
├── proxy-example.yaml
├── ingress.yaml
├── pod.yaml
├── np-front-to-back.yaml
├── opa-deny-all.yaml
├── opa-ns.yaml
└── README.md


/.gitignore:
--------------------------------------------------------------------------------
1 | manifests/
2 | scratch.yaml
3 | NOTES.md
4 | 


--------------------------------------------------------------------------------
/np-allow-metadata.yaml:
--------------------------------------------------------------------------------
 1 | ---
 2 | # Lets pods labelled `role: metadata-accessor` in the `default` namespace send
 3 | # egress traffic to the single address 169.254.169.254 (no port restriction).
 4 | apiVersion: networking.k8s.io/v1
 5 | kind: NetworkPolicy
 6 | metadata:
 7 |   namespace: default
 8 |   name: allow-metadata-access
 9 | spec:
10 |   policyTypes: [Egress]
11 |   podSelector:
12 |     matchLabels:
13 |       role: metadata-accessor
14 |   egress:
15 |     - to:
16 |         - ipBlock:
17 |             cidr: '169.254.169.254/32'
18 | 


--------------------------------------------------------------------------------
/np-deny-metadata.yaml:
--------------------------------------------------------------------------------
 1 | ---
 2 | # Applies to every pod in `default` (empty podSelector): egress is allowed to
 3 | # any IPv4 destination except 169.254.169.254.
 4 | apiVersion: networking.k8s.io/v1
 5 | kind: NetworkPolicy
 6 | metadata:
 7 |   namespace: default
 8 |   name: deny-metadata-access
 9 | spec:
10 |   policyTypes: [Egress]
11 |   podSelector: {}
12 |   egress:
13 |     - to:
14 |         - ipBlock:
15 |             cidr: '0.0.0.0/0'
16 |             except: ['169.254.169.254/32']
17 | 


--------------------------------------------------------------------------------
/pod-security-policy.yaml:
--------------------------------------------------------------------------------
 1 | ---
 2 | # PodSecurityPolicy that rejects privileged pods and privilege escalation;
 3 | # every other rule is left at its most permissive setting.
 4 | apiVersion: policy/v1beta1
 5 | kind: PodSecurityPolicy
 6 | metadata:
 7 |   name: example-psp
 8 | spec:
 9 |   privileged: false  # Don't allow privileged pods!
10 |   allowPrivilegeEscalation: false
11 |   # The rest fills in some required fields.
12 |   runAsUser:
13 |     rule: RunAsAny
14 |   seLinux:
15 |     rule: RunAsAny
16 |   fsGroup:
17 |     rule: RunAsAny
18 |   supplementalGroups:
19 |     rule: RunAsAny
20 |   volumes: ['*']
21 | 


--------------------------------------------------------------------------------
/np-back-to-db.yaml:
--------------------------------------------------------------------------------
 1 | # Admits traffic to the `run: db` pods in `cassandra` only from `run: backend`
 2 | # pods running in a namespace labelled `id: default`.
 3 | apiVersion: networking.k8s.io/v1
 4 | kind: NetworkPolicy
 5 | metadata:
 6 |   name: pod-db-backend-ingress
 7 |   namespace: cassandra
 8 | spec:
 9 |   podSelector:
10 |     matchLabels:
11 |       run: db
12 |   policyTypes:
13 |     - Ingress
14 |   ingress:
15 |     - from:
16 |         # Both selectors sit in ONE list entry, so they are AND-ed. As two
17 |         # separate entries they are OR-ed: every pod in the labelled namespace
18 |         # plus `run: backend` pods inside `cassandra` itself.
19 |         - namespaceSelector:
20 |             matchLabels:
21 |               id: default
22 |           podSelector:
23 |             matchLabels:
24 |               run: backend
25 | 


--------------------------------------------------------------------------------
/np-default-deny.yaml:
--------------------------------------------------------------------------------
 1 | ---
 2 | # Default-deny baseline: an empty podSelector with both policy types and no
 3 | # rules blocks all ingress and egress for every pod in the namespace.
 4 | apiVersion: networking.k8s.io/v1
 5 | kind: NetworkPolicy
 6 | metadata:
 7 |   namespace: default
 8 |   name: default-deny-all
 9 | spec:
10 |   policyTypes: [Ingress, Egress]
11 |   podSelector: {}
12 | ---
13 | # Same baseline for the `cassandra` namespace.
14 | apiVersion: networking.k8s.io/v1
15 | kind: NetworkPolicy
16 | metadata:
17 |   namespace: cassandra
18 |   name: default-deny-all-cassandra
19 | spec:
20 |   policyTypes: [Ingress, Egress]
21 |   podSelector: {}
22 | 


--------------------------------------------------------------------------------
/network-policy.yaml:
--------------------------------------------------------------------------------
 1 | # Egress rules for the `id: frontend` pods in `default`:
 2 | #   - TCP/80 to any pod in namespaces labelled `id: ns1`
 3 | #   - any port to `id: backend` pods in the policy's own namespace
 4 | # NetworkPolicy is served by the networking.k8s.io group, not core v1.
 5 | apiVersion: networking.k8s.io/v1
 6 | kind: NetworkPolicy
 7 | metadata:
 8 |   name: my-network-policy
 9 |   namespace: default
10 | spec:
11 |   podSelector:
12 |     matchLabels:
13 |       id: frontend
14 |   policyTypes:
15 |     - Egress  # only Ingress and Egress are accepted here
16 |   egress:
17 |     - to:
18 |         - namespaceSelector:
19 |             matchLabels:
20 |               id: ns1
21 |       ports:
22 |         - protocol: TCP
23 |           port: 80
24 |     - to:
25 |         - podSelector:
26 |             matchLabels:
27 |               id: backend
28 | 


--------------------------------------------------------------------------------
/proxy-example.yaml:
--------------------------------------------------------------------------------
 1 | ---
 2 | # Two-container Pod: `app` pings google.com while `proxy` installs iptables,
 3 | # lists the current rules and then sleeps for a day.
 4 | apiVersion: v1
 5 | kind: Pod
 6 | metadata:
 7 |   name: app
 8 |   labels:
 9 |     run: app
10 | spec:
11 |   restartPolicy: Always
12 |   dnsPolicy: ClusterFirst
13 |   containers:
14 |     - name: app
15 |       image: bash
16 |       command: ['sh', '-c', 'ping google.com']
17 |       resources: {}
18 |     - name: proxy
19 |       image: ubuntu
20 |       command:
21 |         - sh
22 |         - -c
23 |         - 'apt-get update && apt-get install iptables -y && iptables -L && sleep 1d'
24 |       securityContext:
25 |         capabilities:
26 |           # Extra capability granted to this container only.
27 |           add:
28 |             - NET_ADMIN
29 | 


--------------------------------------------------------------------------------
/ingress.yaml:
--------------------------------------------------------------------------------
 1 | # Routes /frontend and /backend (prefix matches) to the Services of the same
 2 | # name on port 80; the rule sets no host.
 3 | apiVersion: networking.k8s.io/v1
 4 | kind: Ingress
 5 | metadata:
 6 |   name: minimal-ingress
 7 |   annotations:
 8 |     # kubernetes.io/ingress.class: nginx
 9 |     nginx.ingress.kubernetes.io/rewrite-target: /
10 | spec:
11 |   rules:
12 |     - http:
13 |         paths:
14 |           - path: /frontend
15 |             # pathType is mandatory in networking.k8s.io/v1.
16 |             pathType: Prefix
17 |             backend:
18 |               service:
19 |                 name: frontend
20 |                 port:
21 |                   number: 80
22 |           - path: /backend
23 |             pathType: Prefix
24 |             backend:
25 |               service:
26 |                 name: backend
27 |                 port:
28 |                   number: 80
29 | 


--------------------------------------------------------------------------------
/pod.yaml:
--------------------------------------------------------------------------------
 1 | # nginx Pod that consumes Secret `secret1` twice: key `something` as env var
 2 | # FOO, and the whole Secret as a read-only volume under /etc/foo.
 3 | apiVersion: v1
 4 | kind: Pod
 5 | metadata:
 6 |   labels:
 7 |     run: pod1
 8 |   name: pod1
 9 | spec:
10 |   containers:
11 |     - image: nginx
12 |       env:
13 |         - name: FOO
14 |           valueFrom:
15 |             secretKeyRef:
16 |               name: secret1
17 |               key: something
18 |       name: pod1
19 |       volumeMounts:
20 |         - name: secret1
21 |           mountPath: '/etc/foo'
22 |           readOnly: true
23 |       resources:
24 |         # Keyed by requests/limits first, then by resource name.
25 |         # Example values - tune them for the real workload.
26 |         requests:
27 |           cpu: 100m
28 |           memory: 64Mi
29 |         limits:
30 |           cpu: 250m
31 |           memory: 128Mi
32 |   volumes:
33 |     - name: secret1
34 |       secret:
35 |         secretName: secret1
36 |   dnsPolicy: ClusterFirst
37 |   restartPolicy: Always
38 | 


--------------------------------------------------------------------------------
/np-front-to-back.yaml:
--------------------------------------------------------------------------------
 1 | ---
 2 | # `run: frontend` pods in `default` may only open connections to
 3 | # `run: backend` pods in the same namespace.
 4 | apiVersion: networking.k8s.io/v1
 5 | kind: NetworkPolicy
 6 | metadata:
 7 |   namespace: default
 8 |   name: pod-frontend
 9 | spec:
10 |   policyTypes: [Egress]
11 |   podSelector:
12 |     matchLabels:
13 |       run: frontend
14 |   egress:
15 |     - to:
16 |         - podSelector:
17 |             matchLabels:
18 |               run: backend
19 | ---
20 | # `run: backend` pods accept traffic from `run: frontend` pods and may reach
21 | # any pod in namespaces labelled `id: cassandra`.
22 | apiVersion: networking.k8s.io/v1
23 | kind: NetworkPolicy
24 | metadata:
25 |   namespace: default
26 |   name: pod-backend
27 | spec:
28 |   policyTypes: [Ingress, Egress]
29 |   podSelector:
30 |     matchLabels:
31 |       run: backend
32 |   egress:
33 |     - to:
34 |         - namespaceSelector:
35 |             matchLabels:
36 |               id: cassandra
37 |   ingress:
38 |     - from:
39 |         - podSelector:
40 |             matchLabels:
41 |               run: frontend
42 | 


--------------------------------------------------------------------------------
/opa-deny-all.yaml:
--------------------------------------------------------------------------------
 1 | ---
 2 | # Gatekeeper template whose rego rule always fires (1 > 0) and reports the
 3 | # message passed in through the constraint's parameters.
 4 | apiVersion: templates.gatekeeper.sh/v1beta1
 5 | kind: ConstraintTemplate
 6 | metadata:
 7 |   name: k8salwaysdeny
 8 | spec:
 9 |   targets:
10 |     - target: admission.k8s.gatekeeper.sh
11 |       rego: |
12 |         package k8salwaysdeny
13 |         violation[{"msg": msg}] {
14 |           1 > 0
15 |           msg := input.parameters.message
16 |         }
17 |   crd:
18 |     spec:
19 |       names:
20 |         kind: K8sAlwaysDeny
21 |       validation:
22 |         # Shape of the constraint's `parameters` block.
23 |         openAPIV3Schema:
24 |           properties:
25 |             message:
26 |               type: string
27 | ---
28 | # Constraint of the kind declared above; it matches every Pod.
29 | apiVersion: constraints.gatekeeper.sh/v1beta1
30 | kind: K8sAlwaysDeny
31 | metadata:
32 |   name: pod-always-deny
33 | spec:
34 |   parameters:
35 |     message: 'ACCESS DENIED!'
36 |   match:
37 |     kinds:
38 |       - apiGroups:
39 |           - ''
40 |         kinds:
41 |           - Pod
42 | 


--------------------------------------------------------------------------------
/opa-ns.yaml:
--------------------------------------------------------------------------------
 1 | # Gatekeeper template: reports a violation listing every label named in
 2 | # `parameters.labels` that is missing from the reviewed object.
 3 | apiVersion: templates.gatekeeper.sh/v1beta1
 4 | kind: ConstraintTemplate
 5 | metadata:
 6 |   name: k8srequiredlabels
 7 | spec:
 8 |   crd:
 9 |     spec:
10 |       names:
11 |         kind: K8sRequiredLabels
12 |       validation:
13 |         # Schema for the `parameters` field
14 |         openAPIV3Schema:
15 |           properties:
16 |             labels:
17 |               type: array
18 |               # `items` must be a schema object, not a bare type name.
19 |               items:
20 |                 type: string
21 |   targets:
22 |     - target: admission.k8s.gatekeeper.sh
23 |       rego: |
24 |         package k8srequiredlabels
25 |         violation[{"msg": msg, "details": {"missing_labels": missing}}] {
26 |           provided := {label | input.review.object.metadata.labels[label]}
27 |           required := {label | label := input.parameters.labels[_]}
28 |           missing := required - provided
29 |           count(missing) > 0
30 |           msg := sprintf("you must provide labels: %v", [missing])
31 |         }
32 | ---
33 | # Constraint: every Namespace must carry the `cks` label.
34 | apiVersion: constraints.gatekeeper.sh/v1beta1
35 | kind: K8sRequiredLabels
36 | metadata:
37 |   name: ns-must-have-cks
38 | spec:
39 |   match:
40 |     kinds:
41 |       - apiGroups: ['']
42 |         kinds: ['Namespace']
43 |   parameters:
44 |     labels: ['cks']
45 | 
46 | # input.parameters.labels[_] in the ConstraintTemplate's rego reads the list given
47 | # as spec.parameters.labels on the K8sRequiredLabels constraint above.
48 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # k8s-cks-notes
  2 | 
  3 | ## ServiceAccounts
  4 | 
  5 | - You can disable automounting of a `ServiceAccount` on a `ServiceAccount` or `Pod` resource: `automountServiceAccountToken: false`
  6 | - `mount | grep sec` to show the mount inside the `Pod`.
  7 | - Mount for token also shows as mounted volume for the `Pod`. Path can be seen there.
  8 | 
  9 | ## Kubernetes API
 10 | 
 11 | Three components:
 12 | 
 13 | - Authentication - Who are you?
 14 | - Authorization - What are you allowed to do?
 15 | - Admission - Admission controllers. Uses mutating and validating webhooks, among others, on resources.
 16 | 
 17 | Each request to the API is filtered through these.
 18 | 
 19 | Requests are treated as:
 20 | 
 21 | - A normal user
 22 | - A `ServiceAccount`
 23 | - Anonymous access
 24 | 
 25 | Every request must authenticate unless it's anonymous.
 26 | 
 27 | - `--anonymous-auth` `kube-apiserver` flag must be set to `false` to disable anonymous access (the `kubelet` has a flag of the same name for its own API).
 28 | - `/etc/kubernetes/manifests/kube-apiserver.yaml`
 29 | - `--insecure-port` is set to `0` by default, which disables the insecure port. Setting it to anything else opens an insecure port on which requests bypass Authentication and Authorization.
 30 | 
 31 | ## Config
 32 | 
 33 | - `kubectl config view --raw`
 34 | - `k config set-context jane --user=jane --cluster=kubernetes`
 35 | - `k config set-credentials jane --client-key=jane.key --client-certificate=jane.crt --embed-certs`
 36 | - `k config use-context jane`
 37 | 
 38 | You can use a different config file:
 39 | 
 40 | - `k --kubeconfig` or environment variable `KUBECONFIG`
 41 | 
 42 | For example, on a worker `Node`, there is no `kubeconfig` available to use with `kubectl` - but `kubelet` has its own config file that it uses to communicate with the cluster master:
 43 | 
 44 | - `cat /etc/kubernetes/kubelet.conf`
 45 | - `k --kubeconfig /etc/kubernetes/kubelet.conf get node`
 46 | 
 47 | ## Auth
 48 | 
 49 | - `k auth can-i delete deployment -A`
 50 | 
 51 | By extracting the Kubernetes server `ca` and the user `cert` and `key` from `kubectl config view --raw`, you can actually make manual API calls against the Kubernetes API:
 52 | 
 53 | - `curl https://10.142.0.2:6443 --cacert ca --cert cert --key key`
 54 | 
 55 | ## Certificates
 56 | 
 57 | Certificates live at `/etc/kubernetes/pki` on the server. You can also find similar information on clients.
 58 | 
 59 | - `/etc/kubernetes/pki`
 60 | - `openssl x509 -in apiserver.crt -text`
 61 | 
 62 | ## Node Restriction Admission Controller
 63 | 
 64 | In `/etc/kubernetes/manifests/kube-apiserver.yaml`:
 65 | 
 66 | ```yaml
 67 | - --enable-admission-plugins=NodeRestriction
 68 | ```
 69 | 
 70 | This sets the `NodeRestriction` admission controller to enabled. This means that requests are subject to it. For example, this prevents us from labeling the master `Node` from the worker:
 71 | 
 72 | - `k label node cks-master cks/test=yes`
 73 | 
 74 | This will fail - however, we can label our own `Node`:
 75 | 
 76 | - `k label node cks-worker cks/test=yes`
 77 | 
 78 | There are restricted labels that we cannot set ourselves:
 79 | 
 80 | - `node-restriction.kubernetes.io/test=yes` - you couldn't set this.
 81 | 
 82 | This would prevent a malicious user from changing a label like this to allow `Pods` that look for that label to instead run on a compromised `Node`.
 83 | 
 84 | ## Updates
 85 | 
 86 | It's good to update for support, security fixes, bug fixes, and dependencies.
 87 | 
 88 | `1.19.2` - major/minor/patch
 89 | 
 90 | Minor version every 3 months. No Long Term Support.
 91 | 
 92 | Maintenance release branches for the most recent three minor releases - for now, that's `1.19`, `1.18`, and `1.17`.
 93 | 
 94 | #### Update Process
 95 | 
 96 | First, master components are upgraded - `apiserver`, `controller-manager`, `scheduler`.
 97 | 
 98 | Then, worker components are upgraded - `kubelet`, `kube-proxy`.
 99 | 
100 | Components should always have the same minor version as the `apiserver`.
101 | 
102 | `kubelet` can be two minor versions below `apiserver`, but in general don't do this.
103 | 
104 | Stick with same version as `apiserver` or one below for safety.
105 | 
106 | ##### The Process
107 | 
108 | - `kubectl cordon` then `kubectl drain`
109 | - Upgrade
110 | - `kubectl uncordon`
111 | 
112 | ###### Master
113 | 
114 | ```bash
115 | $ k drain cks-master --ignore-daemonsets
116 | $ apt-cache show kubeadm | grep 1.19
117 | $ apt-get install kubeadm=1.19.3-00 kubelet=1.19.3-00 kubectl=1.19.3-00
118 | $ kubeadm upgrade plan
119 | $ kubeadm upgrade apply v1.19.3
120 | $ k uncordon cks-master
121 | ```
122 | 
123 | ###### Worker
124 | 
125 | ```bash
126 | $ k drain cks-worker --ignore-daemonsets  # run this from the master
127 | $ apt-cache show kubeadm | grep 1.19
128 | $ apt-get install kubeadm=1.19.3-00 kubelet=1.19.3-00 kubectl=1.19.3-00
129 | $ systemctl restart kubelet
130 | $ k uncordon cks-worker  # run this from the master
131 | ```
132 | 
133 | ###### Application Resiliency
134 | 
135 | As always, applications should be able to survive an upgrade:
136 | 
137 | - `Pod` termination grace periods.
138 | - `PodDisruptionBudgets`
139 | - Pod Lifecycle Events
140 | 
141 | ## Secrets
142 | 
143 | Usually passwords, API keys, information needed by an application.
144 | 
145 | ## Container Runtime
146 | 
147 | `kubelet` args:
148 | 
149 | - `--container-runtime`
150 | - `--container-runtime-endpoint`
151 | 
152 | `crictl` is an open source adaption that is container and Kubernetes native.
153 | 
154 | Kata containers adds an additional virtualization layer - a bit more like traditional VMs.
155 | 
156 | gVisor (Google) (`runsc`) implements a limited-use kernel that adds further fine grained separation. Runs in user space so it's separated from the Linux kernel.
157 | 
158 | `RuntimeClasses` allow you to specify further runtime environments for objects.
159 | 
160 | You use `spec.runtimeClassName` to associate a `Pod` with a given `RuntimeClass`.
161 | 
162 | ## Security Contexts
163 | 
164 | ```yaml
165 | spec:
166 |   volumes:
167 |     - name: vol
168 |       emptyDir: {}
169 |   securityContext:  # Pod-level settings
170 |     runAsUser: 1000
171 |     runAsGroup: 3000
172 |     fsGroup: 2000
173 |   containers:
174 |     - name: foo
175 |       command:
176 |         - sh
177 |         - -c  # without the dash, sh looks for a script file named "c"
178 |         - sleep 1d
179 |       image: busybox
180 |       resources: {}
181 |       securityContext:  # container-level settings for `foo`
182 |         runAsUser: 0
183 | ```
184 | 
185 | Notice how here, `securityContext` is set at the `Pod` level for all containers, but you can override it for a specific container.
186 | 
187 | Check out API reference from docs if you want to know more about what a flag does.
188 | 
189 | Forcing running as non-root:
190 | 
191 | ```yaml
192 | spec:
193 |   volumes:
194 |     - name: vol
195 |       emptyDir: {}
196 |   securityContext:  # Pod-level settings
197 |     runAsUser: 1000
198 |     runAsGroup: 3000
199 |     fsGroup: 2000
200 |   containers:
201 |     - name: foo
202 |       command:
203 |         - sh
204 |         - -c  # without the dash, sh looks for a script file named "c"
205 |         - sleep 1d
206 |       image: busybox
207 |       resources: {}
208 |       securityContext:  # container-level settings for `foo`
209 |         runAsNonRoot: true
210 | ```
211 | 
212 | This may error if the image runs as root and there is no top-level `securityContext`: `Error: container has runAsNonRoot and image will run as root`
213 | 
214 | ## Privileged Containers
215 | 
216 | ```yaml
217 | spec:
218 |   volumes:
219 |     - name: vol
220 |       emptyDir: {}
221 |   containers:
222 |     - name: foo
223 |       command:
224 |         - sh
225 |         - -c  # without the dash, sh looks for a script file named "c"
226 |         - sleep 1d
227 |       image: busybox
228 |       resources: {}
229 |       securityContext:  # container-level settings for `foo`
230 |         privileged: true
231 | ```
232 | 
233 | Setting `privileged: true` will allow the given `Pod` to make OS-level changes, `sysctl` etc., and interact with the kernel. This is bad practice.
234 | 
235 | Privileged means the container user `root (0)` is directly mapped to host `root (0)`.
236 | 
237 | ## Privilege Escalation
238 | 
239 | Privilege Escalation controls whether a process can gain more privileges than its parent process.
240 | 
241 | ```yaml
242 | spec:
243 |   volumes:
244 |     - name: vol
245 |       emptyDir: {}
246 |   containers:
247 |     - name: foo
248 |       command:
249 |         - sh
250 |         - -c  # without the dash, sh looks for a script file named "c"
251 |         - sleep 1d
252 |       image: busybox
253 |       resources: {}
254 |       securityContext:  # container-level settings for `foo`
255 |         allowPrivilegeEscalation: true
256 | ```
257 | 
258 | `allowPrivilegeEscalation: true` is the default. You can set to `false` to disable this behavior.
259 | 
260 | ## Pod Security Policies
261 | 
262 | Cluster-level resources. Controls under which security conditions a `Pod` has to run.
263 | 
264 | Pod Security Policy runs as an admission controller. A `Pod` will only be created if it adheres to these rules. It inspects the security contexts.
265 | 
266 | Enabling this will deny all `Pods` from being created in the cluster since out of the box none of the `ServiceAccounts` have the necessary permissions to look at `PodSecurityPolicies`.
267 | 
268 | As an admin user, we can create `Pods`, but they wouldn't create as a result of a `Deployment`, as an example, since that creates using the `ServiceAccount` we talked about before.
269 | 
270 | You'd have to give the `ServiceAccount`, in this case the `default`, the ability to evaluate `PodSecurityPolicies`:
271 | 
272 | - `k create role psp-access --verb=use --resource=podsecuritypolicies`
273 | - `k create rolebinding psp-access --role=psp-access --serviceaccount=default:default`
274 | 
275 | You'd want to create the proper RBAC and `PodSecurityPolicy` resources before enabling this.
276 | 
277 | ## Mutual TLS (mTLS)
278 | 
279 | Two-way bilateral authentication. Two parties authenticate to each other to create a secure communication channel.
280 | 
281 | By default, all `Pods` can communicate with each other as an implicit function of the chosen CNI.
282 | 
283 | Typically, TLS is terminated at an `Ingress` and is unencrypted on the backend in the cluster, between `Pods`, etc.
284 | 
285 | You could use a sidecar proxy container to handle this encryption overhead.
286 | 
287 | Something like Istio works with this model - a managed proxy that takes care of certificates.
288 | 
289 | You could use an `initContainer` that creates `iptables` rules that would force all traffic from your application's `Pods` through the proxy container.
290 | 
291 | All containers in a `Pod` share the same network namespace, so a proxy container can manage `iptables` rules for the whole `Pod`, provided you add the `NET_ADMIN` capability as given in the example.
292 | 
293 | ## Open Policy Agent
294 | 
295 | Open source, general purpose policy engine. You've used this before. It works in Kubernetes too.
296 | 
297 | Uses rego in Kubernetes the same way. Works with JSON/YAML. It is not natively Kubernetes aware with its resources, for example.
298 | 
299 | OPA Gatekeeper creates CRDs to allow Kubernetes to work with OPA.
300 | 
301 | ```yaml
302 | apiVersion: templates.gatekeeper.sh/v1beta1
303 | kind: ConstraintTemplate
304 | metadata:
305 |   name: k8srequiredlabels
306 | ```
307 | 
308 | ```yaml
309 | apiVersion: constraints.gatekeeper.sh/v1beta1
310 | kind: K8sRequiredLabels
311 | metadata:
312 |   name: pod-must-have-foolabel
313 | ---
314 | apiVersion: constraints.gatekeeper.sh/v1beta1
315 | kind: K8sRequiredLabels
316 | metadata:
317 |   name: pod-must-have-barlabel
318 | ```
319 | 
320 | It creates admission webhooks - there are two kinds - `validating` and `mutating`. Mutating webhooks are invoked first and can modify objects. After this is done, Validating webhooks are invoked and can reject requests to enforce custom policies.
321 | 
322 | Use Rego playground to test OPA policies.
323 | 
324 | ## Image Footprints
325 | 
326 | Remember that containers use syscalls against the host OS kernel. It is possible to run containers using the same PID so they share processes.
327 | 
328 | Docker images are built using layers. Only the instructions `RUN`, `COPY`, and `ADD` create layers. Other instructions create temporary intermediate images, and do not increase the size of the build.
329 | 
330 | An image size factors in the base image size plus additional layers, etc.
331 | 
332 | ```Dockerfile
333 | FROM ubuntu
334 | ARG DEBIAN_FRONTEND=noninteractive
335 | RUN apt-get update && apt-get install -y golang-go
336 | COPY app.go .
337 | RUN CGO_ENABLED=0 go build app.go
338 | # Final stage: only the compiled binary is carried over from stage 0.
339 | FROM alpine
340 | RUN chmod a-w /etc
341 | RUN addgroup -S appgroup && adduser -S appuser -G appgroup -h /home/appuser
342 | RUN rm -rf /bin/*
343 | COPY --from=0 /app /home/appuser/
344 | USER appuser
345 | CMD ["/home/appuser/app"]
346 | ```
347 | 
348 | Using this logic, the `COPY --from=0` step in stage 1 (at the bottom) copies the built file out of the first stage, stage 0, into the resultant final image.
349 | 
350 | This essentially gives you a resulting image that only encompasses the final stage, reducing its size.
351 | 
352 | ### Hardening An Image
353 | 
354 | - Using specific versions in a Dockerfile is more secure instead of using something like `latest`.
355 | - Don't run as root in a container. In the example above, we establish a dedicated user and then run as that user by calling `USER`.
356 | - Making the filesystem read only is also more secure. This avoids allowing write access to a given filesystem running as part of a container. Using the line `RUN chmod a-w /etc`, we remove write permissions for the `/etc` directory for all users.
357 | - Removing shell access is also an optimization. We do this in the Dockerfile by running `RUN rm -rf /bin/*` - this essentially removes the ability to run anything located in that directory. This explains why sometimes you cannot exec into a container (think top level Kubernetes `Pods`) because that is missing.
358 | - In general, running commands together in a layer, like `apt-get update && apt-get install` all in the same operation is better and decreases size. Also cleanup install packages in the build.
359 | 
360 | ## Static Analysis
361 | 
362 | Looks at the source code and text files and parses them to check against rules. Those rules can then be enforced.
363 | 
364 | Examples:
365 | 
366 | - Always define resource requests and limits.
367 | - `Pods` should never use the default `ServiceAccount`.
368 | - Don't store sensitive data in plain text in Dockerfiles or Kubernetes resources.
369 | 
370 | Twistlock or Sysdig is an example of this.
371 | 
372 | You could do this in an image build phase or after the build phase in a test phase. We do this by leveraging the Sysdig inline scanner.
373 | 
374 | `PodSecurityPolicies` and OPA can be used within the cluster for static analysis.
375 | 
376 | For example, pulling info from a `Secret` as an environment variable is more secure than hardcoding things. Look out for obvious stuff.
377 | 
378 | Kubesec is risk analysis for Kubernetes resources. It's open source and opinionated. It runs using a fixed set of rules based on security best practices. It can run as a binary, a Docker container, a `kubectl` plugin, or an admission controller. Remember that tools like Sysdig also offer an admission controller. You can also paste a manifest into Kubesec to evaluate it on demand.
379 | 
380 | Remember these important features for `securityContext`:
381 | - `readOnlyRootFilesystem = true`
382 | - `runAsNonRoot = true`
383 | - `runAsUser -gt 10000`
384 | - `capabilities .drop`
385 | 
386 | OPA offers a tool called `conftest` that uses the same OPA rego language. You can run this in Docker as well.
387 | 
388 | You can also use `conftest` against Dockerfiles.
389 | 
390 | ## Image Vulnerability Scanning
391 | 
392 | Containers that contain exploitable packages are a problem. This could result in privilege escalation, data leaks, DDoS, etc.
393 | 
394 | https://cve.mitre.org/
395 | https://nvd.nist.gov/
396 | 
397 | Tools use these databases to scan images for vulnerabilities. We use the Sysdig inline scanner for this. You could stop a build, or use an Admission Controller to not allow a compromised image version to run in a cluster.
398 | 
399 | You could also restrict based on registry hostnames within the cluster using something like OPA or `PodSecurityPolicies`. This would happen either in `MutatingAdmission` or `Validating` webhook stages.
400 | 
401 | This type of scanning could also take place in a container registry like GCR or ECR.
402 | 
403 | ### Clair
404 | 
405 | Open source vulnerability assessment tool. CNCF supported. Uses vulnerability databases.
406 | 
407 | ### Trivy
408 | 
409 | Also open source. One command to run it.
410 | 
411 | `$ docker run ghcr.io/aquasecurity/trivy:latest image nginx`
412 | 
413 | This will cross reference failures with CVE numbers.
414 | 
415 | ## Supply Chain Security
416 | 
417 | Using a private registry is an example of a secure supply chain component.
418 | 
419 | You can create a `docker-registry` `Secret` in Kubernetes and then associate the `imagePullSecrets` to the `ServiceAccounts`, for example.
420 | 
421 | You can run an image using an image digest instead of a tag since tags can theoretically change and point to different digests.
422 | 
423 | You could use OPA via the Admission Controller to limit images to specific repositories.
424 | 
425 | Remember that you create a kind of `ConstraintTemplate` that uses `spec.crd.spec.names.kind=K8sTrustedImages` and then create a `K8sTrustedImages` object. The admission webhook would then deny creation of a `Pod` if its spec fails the checks specified by these templates.
426 | 
427 | `ImagePolicyWebhook` creates a kind of `ImageReview` which can be assessed by an external tool as part of an admission workflow.
428 | 
429 | You would add this to `--enable-admission-plugins` as `ImagePolicyWebhook` to enable the Admission Controller.
430 | 
431 | - `--admission-control-config-file=path-to-admission-config.yaml`
432 | 
433 | You have to adjust Volumes to mount this data in the `kube-apiserver` `Pod`. You must have certificates, a `kubeconfig`, etc.
434 | 
435 | `AdmissionConfiguration` kind is where this logic is configured. Remember that `defaultAllow` is set to `false` by default which means no `Pods` will create out of the box if this configuration is incomplete.
436 | 
437 | ## Behavioral Analytics
438 | 
439 | Syscall interface is provided by the kernel, for example `getpid()` or `reboot()`.
440 | 
441 | Applications run in the user space. Applications can communicate with the syscall interface or they can use libraries. The request is then passed to the kernel and the hardware.
442 | 
443 | `seccomp` and AppArmor lie between the user space and syscall interface for added protection.
444 | 
445 | Processes in a container are able to communicate with the kernel given how they run in shared spaces. They're namespaced via container logic, but can still talk to the kernel.
446 | 
447 | `strace` intercepts and logs system calls made by a process. It can also log and display signals received by a process so it's good for debugging etc.
448 | 
449 | - `$ strace ls /`
450 | 
451 | This would provide a list of syscalls made to the kernel.
452 | 
453 | In the end, all commands run on the command line result in syscalls for how they operate.
454 | 
455 | - `/proc` directory contains information and connections to processes and kernel.
456 | - Study it to learn how processes work.
457 | - Configuration and administrative tasks.
458 | - Contains files that don't technically exist.
459 | 
460 | You can do `docker ps | grep etcd` and then `ps aux | grep etcd` to find the running `etcd` process.
461 | 
462 | - `$ strace -p 3502 -f` - this would show you syscalls made by a process, in this case `etcd`. `-f` follows it.
463 | 
464 | You can go to `cd /proc/3502` and `ls` and see open files related to a process - `etcd` in this case.
465 | 
466 | Writes to `etcd` for things like `Secrets` will show up in `/proc/3502/fd/7 (symlink)` as an example.
467 | 
468 | - `$ pstree -p` shows process tree of running processes. You could find `containerd` in here and see a list of running containers.
469 | 
470 | You could use this to find the `pid` of a running container, navigate to its `/proc/pid` folder and `cat environ` and see environment variable data.
471 | 
472 | ### Falco
473 | 
474 | CNCF native runtime security. Deep kernel tracing built on the Linux kernel.
475 | 
476 | Describe security rules against a system, detect unwanted behavior.
477 | 
478 | Automated response to security violations.
479 | 
480 | Kubernetes docs has a page specific to Falco with instructions for installation.
481 | 
482 | `/etc/falco` where configuration files are stored.
483 | 
484 | - `$ tail -f /var/log/syslog | grep falco`
485 | 
486 | You would see log output from `falco` activity that shows running processes, package management launched, shell executions, etc.
487 | 
488 | Know how to find those rules and review them.
489 | 
490 | ## Immutability of Containers at Runtime
491 | 
492 | Immutability simply means the container won't be modified during its lifetime.
493 | 
494 | ### Mutable
495 | 
496 | - `ssh` to a container, stop application, update application, start application
497 | 
498 | ### Immutable
499 | 
500 | - Create new container image, delete container instance, create new container instance.
501 | 
502 | With immutable containers, we always know the state. With mutable instances, we know less.
503 | 
504 | Immutability allows us to use advanced deployment methods native to Kubernetes, easy rollbacks, more reliability, and better security.
505 | 
506 | ### Making Containers Immutable
507 | 
508 | - Remove bash/shell, make filesystem read only, run as a specific non-root user
509 | - You could remove write privileges to all non-essential directories in line using `command`.
510 | - `startupProbe` - runs prior to liveness/readiness checks. You could use this to enforce changes too.
511 | - Use `SecurityContexts` and `PodSecurityPolicies` to enforce read only fs, etc.
512 | - You could use an init container to handle read/write permissions and then harden the app container.
513 | 
514 | ### StartupProbe
515 | 
516 | ```yaml
517 | apiVersion: v1
518 | kind: Pod
519 | metadata:
520 |   creationTimestamp: null
521 |   labels:
522 |     run: immutable
523 |   name: immutable
524 | spec:
525 |   containers:
526 |   - image: httpd
527 |     name: immutable
528 |     resources: {}
529 |     startupProbe:
530 |       exec:
531 |         command:
532 |           - rm
533 |           - /bin/touch
534 |       initialDelaySeconds: 1
535 |       periodSeconds: 5
536 |   dnsPolicy: ClusterFirst
537 |   restartPolicy: Always
538 | status: {}
539 | ```
540 | 
541 | ### SecurityContext
542 | 
543 | ```yaml
544 | apiVersion: v1
545 | kind: Pod
546 | metadata:
547 |   creationTimestamp: null
548 |   labels:
549 |     run: immutable
550 |   name: immutable
551 | spec:
552 |   containers:
553 |   - image: httpd
554 |     name: immutable
555 |     resources: {}
556 |     securityContext:
557 |       readOnlyRootFilesystem: true
558 |     volumeMounts:
559 |       - mountPath: /usr/local/apache2/logs
560 |         name: cache-volume
561 |   volumes:
562 |     - name: cache-volume
563 |       emptyDir: {}
564 |   dnsPolicy: ClusterFirst
565 |   restartPolicy: Always
566 | status: {}
567 | ```
568 | 
569 | In this example, we use `readOnlyRootFilesystem` but `apache` will complain if it can't write to the directory it needs for logs. We get around this by using `emptyDir` to allow it to write to that temporary directory.
570 | 
571 | ## Kubernetes Auditing
572 | 
573 | Kubernetes is all based on API requests. Auditing allows us to hold those requests so that we can review them for security purposes.
574 | 
575 | - Did someone access an important `Secret` while it was not protected? We can check who accessed it.
576 | - When was the last time that user X accessed cluster Y?
577 | - Does my CRD work properly?
578 | 
579 | Requests to the Kubernetes API run through stages:
580 | 
581 | - `RequestReceived` - The stage for events generated as soon as the audit handler receives the request, and before it is delegated down the handler chain.
582 | - `ResponseStarted` - Once the response headers are sent, but before the response body is sent. This stage is only generated for long-running requests (e.g. `watch`).
583 | - `ResponseComplete` - The response body has been completed and no more bytes will be sent.
584 | - `Panic` - Events generated when a panic occurred.
585 | 
586 | Using these stages, we can customize exactly what we want to log.
587 | 
588 | Audit policy rule levels:
589 | 
590 | - `None` - don't log events that match this rule.
591 | - `Metadata` - log request metadata (requesting user, timestamp, resource, verb, etc.) but not request or response body.
592 | - `Request` - log event metadata and request body but not response body. This does not apply for non-resource requests.
593 | - `RequestResponse` - log event metadata, request, and response bodies. This does not apply for non-resource requests.
594 | 
595 | ```yaml
596 | apiVersion: audit.k8s.io/v1
597 | kind: Policy
598 | omitStages:
599 |   - "RequestReceived"
600 | rules:
601 | # log no "read" actions
602 |   - level: None
603 |     verbs: ["get", "watch", "list"]
604 | 
605 | # log nothing regarding events
606 |   - level: None
607 |     resources:
608 |       - group: "" # core
609 |         resources: ["events"]
610 | 
611 | # log nothing coming from some groups
612 |   - level: None
613 |     userGroups: ["system:nodes"]
614 |   - level: RequestResponse
615 |     resources:
616 |       - group: ""
617 |         resources: ["secrets"]
618 | 
619 | # for everything else log
620 |   - level: Metadata
621 | ```
622 | 
623 | ### Configuring
624 | 
625 | - `$ mkdir /etc/kubernetes/audit`
626 | - Create `policy.yaml` in the `audit` folder.
627 | - Edit `kube-apiserver.yaml` with the following flags:
628 |   - `--audit-policy-file=/etc/kubernetes/audit/policy.yaml`
629 |   - `--audit-log-path=/etc/kubernetes/audit/logs/audit.log`
630 |   - `--audit-log-maxsize=500`
631 |   - `--audit-log-maxbackup=5`
632 | - Edit `kube-apiserver.yaml` to mount the location in the `apiserver` `Pods` to store the logs:
633 | 
634 | ```yaml
635 |     volumeMounts:
636 |     - mountPath: /etc/kubernetes/audit
637 |       name: audit
638 |   volumes:
639 |     - hostPath:
640 |         path: /etc/kubernetes/audit
641 |         type: DirectoryOrCreate
642 |       name: audit
643 | ```
644 | 
645 | Use the docs for auditing to help with this part.
646 | 
647 | Backends for storing audit data can be JSON logs, external APIs (webhooks), dynamic backend (ElasticSearch, FileBeat, Fluentd).
648 | 
649 | This file is largely used for parsing on your own. It should really be exported somewhere useful and readable. But you can do something like create a `Secret` and `grep` for it in the log file.
650 | 
651 | ### Advanced Audit Policy
652 | 
653 | ```yaml
654 | apiVersion: audit.k8s.io/v1
655 | kind: Policy
656 | omitStages:
657 |   - 'RequestReceived'
658 | rules:
659 |   # log no "read" actions
660 |   - level: None
661 |     verbs: ['get', 'watch', 'list']
662 | 
663 |   # log only metadata from Secrets
664 |   - level: Metadata
665 |     resources:
666 |       - group: ''
667 |         resources: ['secrets']
668 | 
669 |   # for everything else log
670 |   - level: RequestResponse
671 | ```
672 | 
673 | If you create a policy and the `kube-apiserver` `Pod` doesn't come back up, remember the trick using `/var/log/pods` to see what's going on. You'll likely see more than one since the new one didn't start, so use the latest and read its logs.
674 | 
675 | ### Looking at API Access History for Secrets
676 | 
677 | ```yaml
678 | apiVersion: audit.k8s.io/v1
679 | kind: Policy
680 | omitStages:
681 |   - 'RequestReceived'
682 | rules:
683 |   # log no "read" actions
684 |   - level: None
685 |     verbs: ['get', 'watch', 'list']
686 | 
687 |   # log full request and response bodies for Secrets
688 |   - level: RequestResponse
689 |     resources:
690 |       - group: ''
691 |         resources: ['secrets']
692 | 
693 |   # for everything else
694 |   - level: RequestResponse
695 | ```
696 | 
697 | Commenting out the line for the file path for the policy is enough to cause the `apiserver` to restart. You'll have to uncomment it to get it to restart again and use your new policy.
698 | 
699 | You could use this to see API calls for things - including patching objects, etc.
700 | 
701 | ## Kernel Hardening Tools
702 | 
703 | Processes are restricted in namespaces which restrict what they can see - users, filesystems, other processes. `cgroups` restrict the resource usage of processes.
704 | 
705 | Between the syscall interface and top level spaces like user and app space, we can use AppArmor and `seccomp` to harden the kernel.
706 | 
707 | ### AppArmor
708 | 
709 | You create profiles in AppArmor to define what a process can and cannot do. You could do this for something like `kubelet` or any other app.
710 | 
711 | Profiles have the concept of modes:
712 | 
713 | - `Unconfined` - process can escape, nothing is enforced.
714 | - `Complain` - processes can escape but will be logged.
715 | - `Enforce` - processes cannot escape, cannot do more than we allow them to do in their profile.
716 | 
717 | Commands:
718 | 
719 | - `aa-status` - show profiles
720 | - `aa-genprof` - create new profile (smart wrapper around `aa-logprof`)
721 | - `aa-complain` - put profile in complain mode
722 | - `aa-enforce` - put profile in enforce mode
723 | - `aa-logprof` - update the profile if app produced some more usage logs (syslog)
724 | 
725 | You could do this for `curl`: `$ aa-genprof curl`
726 | 
727 | Profiles are located in `/etc/apparmor.d`.
728 | 
729 | You could then run `aa-logprof` to review recommended changes and then save them so your processes are more secure.
730 | 
731 | #### With Docker
732 | 
733 | `/etc/apparmor.d` is where policies go.
734 | 
735 | For example `/etc/apparmor.d/docker-nginx`
736 | 
737 | `$ apparmor_parser path-to-file` adds it.
738 | 
739 | `$ docker run --security-opt apparmor=docker-nginx`
740 | 
741 | #### With Kubernetes
742 | 
743 | Container runtime needs to support AppArmor. Docker does by default.
744 | 
745 | AppArmor must be installed on every node, and the profiles need to be available on every node.
746 | 
747 | AppArmor profiles are specified **per container**, not per `Pod`.
748 | 
749 | ```yaml
750 | apiVersion: v1
751 | kind: Pod
752 | metadata:
753 |   annotations:
754 |     container.apparmor.security.beta.kubernetes.io/secure: localhost/docker-nginx # the container name goes after the / in the key; the profile reference is the value
755 |   labels:
756 |     run: secure
757 |   name: secure
758 | spec:
759 |   containers:
760 |   - image: nginx
761 |     name: secure
762 |     resources: {}
763 |   dnsPolicy: ClusterFirst
764 |   restartPolicy: Always
765 | status: {}
766 | ```
767 | 
768 | The profile must exist in order for the `Pod` to be created, otherwise its creation will be `Blocked`.
769 | 
770 | Annotations for AppArmor can be found in Kubernetes docs.
771 | 
772 | ### seccomp
773 | 
774 | "Secure computing mode."
775 | 
776 | Security facility in the Linux kernel.
777 | 
778 | Restricts execution of syscalls made by processes.
779 | 
780 | By default, applications can make syscalls as needed. Using `seccomp`, we can restrict syscalls.
781 | 
782 | It only allows `exit()`, `sigreturn()`, `read()`, and `write()` by default.
783 | 
784 | Nowadays `seccomp` is combined with BPF to make `seccomp-bpf`.
785 | 
786 | #### With Docker
787 | 
788 | `$ docker run --security-opt seccomp=default.json nginx`
789 | 
790 | #### With Kubernetes
791 | 
792 | In order for kubelet to use `seccomp` profiles, you can consult the Kubernetes docs and find the argument for specifying the directory, which is `/var/lib/kubelet/seccomp` by default.
793 | 
794 | You can create the directory and then add your profile to it.
795 | 
796 | ```yaml
797 | apiVersion: v1
798 | kind: Pod
799 | metadata:
800 |   #annotations:
801 |   #  seccomp.security.alpha.kubernetes.io/secure: localhost/profiles/default.json # pre 1.19 you could use this annotation
802 |   labels:
803 |     run: secure
804 |   name: secure
805 | spec:
806 |   # in 1.19+, you enable seccomp using securityContext
807 |   securityContext:
808 |     seccompProfile:
809 |       type: Localhost
810 |       localhostProfile: default.json
811 |   containers:
812 |   - image: nginx
813 |     name: secure
814 |     resources: {}
815 |   dnsPolicy: ClusterFirst
816 |   restartPolicy: Always
817 | status: {}
818 | ```
819 | 
820 | Like AppArmor, if the profile doesn't exist, the `Pod`'s container will not start and the `Pod` will show status `CreateContainerError`. Running `describe` will show this error.
821 | 
822 | ## Reduce Attack Surface of Host
823 | 
824 | Attack surface consists of anything exposed that can be exploited by malicious code: networking, applications, IAM, etc.
825 | 
826 | Applications and kernel should always be up to date. Additional packages should not be present if not needed.
827 | 
828 | Use a firewall to eliminate access to a port.
829 | 
830 | Run applications as a specific user instead of root to avoid privilege escalation.
831 | 
832 | `Nodes` should be ephemeral. They should be created from images and immutable. They should be recycled efficiently and without downtime.
833 | 
834 | A lot of included services with base level OS images, like `ubuntu`, are unnecessary and increase attack surface.
835 | 
836 | - `$ netstat -plnt`
837 | - `$ lsof -i :22`
838 | - `$ systemctl status kubelet`
839 | - `$ ps aux`
840 | 
841 | For example, to disable a service:
842 | 
843 | - `$ systemctl list-units --type=service --state=running | grep snapd`
844 | - `$ systemctl stop snapd`
845 | - `$ systemctl disable snapd`
846 | 
847 | Show users:
848 | 
849 | - `$ cat /etc/passwd`
850 | 
851 | Add users:
852 | 
853 | - `$ adduser foo`
854 |   
855 | Delete users:
856 | 
857 | - `$ deluser foo`
858 | 
859 | Show bash processes:
860 | 
861 | - `$ ps aux | grep bash`
862 | 
863 | ## Handy Stuff
864 | 
865 | - `/var/log/pods` - You can see `Pod` logs here if you're unable to use the API for some reason.
866 | 


--------------------------------------------------------------------------------