├── .gitignore ├── np-allow-metadata.yaml ├── np-deny-metadata.yaml ├── pod-security-policy.yaml ├── np-back-to-db.yaml ├── np-default-deny.yaml ├── network-policy.yaml ├── proxy-example.yaml ├── ingress.yaml ├── pod.yaml ├── np-front-to-back.yaml ├── opa-deny-all.yaml ├── opa-ns.yaml └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | manifests/ 2 | scratch.yaml 3 | NOTES.md 4 | -------------------------------------------------------------------------------- /np-allow-metadata.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: networking.k8s.io/v1 2 | kind: NetworkPolicy 3 | metadata: 4 | name: allow-metadata-access 5 | namespace: default 6 | spec: 7 | podSelector: 8 | matchLabels: 9 | role: metadata-accessor 10 | policyTypes: 11 | - Egress 12 | egress: 13 | - to: 14 | - ipBlock: 15 | cidr: 169.254.169.254/32 16 | -------------------------------------------------------------------------------- /np-deny-metadata.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: networking.k8s.io/v1 2 | kind: NetworkPolicy 3 | metadata: 4 | name: deny-metadata-access 5 | namespace: default 6 | spec: 7 | podSelector: {} 8 | policyTypes: 9 | - Egress 10 | egress: 11 | - to: 12 | - ipBlock: 13 | cidr: 0.0.0.0/0 14 | except: 15 | - 169.254.169.254/32 16 | -------------------------------------------------------------------------------- /pod-security-policy.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: policy/v1beta1 2 | kind: PodSecurityPolicy 3 | metadata: 4 | name: example-psp 5 | spec: 6 | allowPrivilegeEscalation: false 7 | privileged: false # Don't allow privileged pods! 8 | # The rest fills in some required fields. 
9 | seLinux: 10 | rule: RunAsAny 11 | supplementalGroups: 12 | rule: RunAsAny 13 | runAsUser: 14 | rule: RunAsAny 15 | fsGroup: 16 | rule: RunAsAny 17 | volumes: 18 | - '*' 19 | -------------------------------------------------------------------------------- /np-back-to-db.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: networking.k8s.io/v1 2 | kind: NetworkPolicy 3 | metadata: 4 | name: pod-db-backend-ingress 5 | namespace: cassandra 6 | spec: 7 | podSelector: 8 | matchLabels: 9 | run: db 10 | policyTypes: 11 | - Ingress 12 | ingress: 13 | - from: 14 | - namespaceSelector: 15 | matchLabels: 16 | id: default 17 | - podSelector: 18 | matchLabels: 19 | run: backend 20 | -------------------------------------------------------------------------------- /np-default-deny.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: networking.k8s.io/v1 2 | kind: NetworkPolicy 3 | metadata: 4 | name: default-deny-all 5 | namespace: default 6 | spec: 7 | podSelector: {} 8 | policyTypes: 9 | - Ingress 10 | - Egress 11 | --- 12 | apiVersion: networking.k8s.io/v1 13 | kind: NetworkPolicy 14 | metadata: 15 | name: default-deny-all-cassandra 16 | namespace: cassandra 17 | spec: 18 | podSelector: {} 19 | policyTypes: 20 | - Ingress 21 | - Egress 22 | -------------------------------------------------------------------------------- /network-policy.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: networking.k8s.io/v1 2 | kind: NetworkPolicy 3 | metadata: 4 | name: my-network-policy 5 | namespace: default 6 | spec: 7 | podSelector: 8 | matchLabels: 9 | id: frontend 10 | policyTypes: 11 | - Egress 12 | egress: 13 | - to: 14 | - namespaceSelector: 15 | matchLabels: 16 | id: ns1 17 | ports: 18 | - protocol: TCP 19 | port: 80 20 | - to: 21 | - podSelector: 22 | matchLabels: 23 | id: backend 24 | 
-------------------------------------------------------------------------------- /proxy-example.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | labels: 5 | run: app 6 | name: app 7 | spec: 8 | containers: 9 | - command: 10 | - sh 11 | - -c 12 | - ping google.com 13 | image: bash 14 | name: app 15 | resources: {} 16 | - name: proxy 17 | image: ubuntu 18 | command: 19 | - sh 20 | - -c 21 | - 'apt-get update && apt-get install iptables -y && iptables -L && sleep 1d' 22 | securityContext: 23 | capabilities: 24 | add: ['NET_ADMIN'] 25 | dnsPolicy: ClusterFirst 26 | restartPolicy: Always 27 | -------------------------------------------------------------------------------- /ingress.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: networking.k8s.io/v1 2 | kind: Ingress 3 | metadata: 4 | name: minimal-ingress 5 | annotations: 6 | #kubernetes.io/ingress-class: nginx 7 | nginx.ingress.kubernetes.io/rewrite-target: / 8 | spec: 9 | rules: 10 | - http: 11 | paths: 12 | - path: /frontend 13 | backend: 14 | service: 15 | name: frontend 16 | port: 17 | number: 80 18 | - path: /backend 19 | backend: 20 | service: 21 | name: backend 22 | port: 23 | number: 80 24 | -------------------------------------------------------------------------------- /pod.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | labels: 5 | run: pod1 6 | name: pod1 7 | spec: 8 | containers: 9 | - image: nginx 10 | env: 11 | - name: FOO 12 | valueFrom: 13 | secretKeyRef: 14 | name: secret1 15 | key: something 16 | name: pod1 17 | volumeMounts: 18 | - name: secret1 19 | mountPath: '/etc/foo' 20 | readOnly: true 21 | resources: 22 | cpu: 23 | request: 24 | limit: 25 | memory: 26 | request: 27 | limit: 28 | volumes: 29 | - name: secret1 30 | secret: 31 | secretName: secret1 32 | dnsPolicy: 
ClusterFirst 33 | restartPolicy: Always 34 | -------------------------------------------------------------------------------- /np-front-to-back.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: networking.k8s.io/v1 2 | kind: NetworkPolicy 3 | metadata: 4 | name: pod-frontend 5 | namespace: default 6 | spec: 7 | podSelector: 8 | matchLabels: 9 | run: frontend 10 | policyTypes: 11 | - Egress 12 | egress: 13 | - to: 14 | - podSelector: 15 | matchLabels: 16 | run: backend 17 | --- 18 | apiVersion: networking.k8s.io/v1 19 | kind: NetworkPolicy 20 | metadata: 21 | name: pod-backend 22 | namespace: default 23 | spec: 24 | podSelector: 25 | matchLabels: 26 | run: backend 27 | policyTypes: 28 | - Ingress 29 | - Egress 30 | ingress: 31 | - from: 32 | - podSelector: 33 | matchLabels: 34 | run: frontend 35 | egress: 36 | - to: 37 | - namespaceSelector: 38 | matchLabels: 39 | id: cassandra 40 | -------------------------------------------------------------------------------- /opa-deny-all.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: templates.gatekeeper.sh/v1beta1 2 | kind: ConstraintTemplate 3 | metadata: 4 | name: k8salwaysdeny 5 | spec: 6 | crd: 7 | spec: 8 | names: 9 | kind: K8sAlwaysDeny 10 | validation: 11 | # Schema for the `parameters` field 12 | openAPIV3Schema: 13 | properties: 14 | message: 15 | type: string 16 | targets: 17 | - target: admission.k8s.gatekeeper.sh 18 | rego: | 19 | package k8salwaysdeny 20 | violation[{"msg": msg}] { 21 | 1 > 0 22 | msg := input.parameters.message 23 | } 24 | --- 25 | apiVersion: constraints.gatekeeper.sh/v1beta1 26 | kind: K8sAlwaysDeny 27 | metadata: 28 | name: pod-always-deny 29 | spec: 30 | match: 31 | kinds: 32 | - apiGroups: [''] 33 | kinds: ['Pod'] 34 | parameters: 35 | message: 'ACCESS DENIED!' 
36 | -------------------------------------------------------------------------------- /opa-ns.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: templates.gatekeeper.sh/v1beta1 2 | kind: ConstraintTemplate 3 | metadata: 4 | name: k8srequiredlabels 5 | spec: 6 | crd: 7 | spec: 8 | names: 9 | kind: K8sRequiredLabels 10 | validation: 11 | # Schema for the `parameters` field 12 | openAPIV3Schema: 13 | properties: 14 | labels: 15 | type: array 16 | items: string 17 | targets: 18 | - target: admission.k8s.gatekeeper.sh 19 | rego: | 20 | package k8srequiredlabels 21 | violation[{"msg": msg, "details": {"missing_labels": missing}}] { 22 | provided := {label | input.review.object.metadata.labels[label]} 23 | required := {label | label := input.parameters.labels[_]} 24 | missing := required - provided 25 | count(missing) > 0 26 | msg := sprintf("you must provide labels: %v", [missing]) 27 | } 28 | --- 29 | apiVersion: constraints.gatekeeper.sh/v1beta1 30 | kind: K8sRequiredLabels 31 | metadata: 32 | name: ns-must-have-cks 33 | spec: 34 | match: 35 | kinds: 36 | - apiGroups: [''] 37 | kinds: ['Namespace'] 38 | parameters: 39 | labels: ['cks'] 40 | 41 | # Using input.parameters.labels[_] in the ConstraintTemplate corresponds to when you create the K8sRequiredLabels kind below. 42 | # It looks for what you pass in as spec.parameters.labels 43 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # k8s-cks-notes 2 | 3 | ## ServiceAccounts 4 | 5 | - You can disable automounting of a `ServiceAccount` on a `ServiceAccount` or `Pod` resource: `automountServiceAccountToken: false` 6 | - `mount | grep sec` to show the mount inside the `Pod`. 7 | - Mount for token also shows as mounted volume for the `Pod`. Path can be seen there. 
8 | 9 | ## Kubernetes API 10 | 11 | Three components: 12 | 13 | - Authentication - Who are you? 14 | - Authorization - What are you allowed to do? 15 | - Admission - Admission controllers. Uses mutating and validating webhooks, among others, on resources. 16 | 17 | Each request to the API is filtered through these. 18 | 19 | Requests are treated as: 20 | 21 | - A normal user 22 | - A `ServiceAccount` 23 | - Anonymous access 24 | 25 | Every request must authenticate unless it's anonymous. 26 | 27 | - `--anonymous-auth` `kube-apiserver` flag must be set to `false` to disable anonymous access. 28 | - `/etc/kubernetes/manifests/kube-apiserver.yaml` 29 | - `--insecure-port` is set to `0` by default, which disables the insecure port. Setting it to any other value enables the insecure port, and requests sent to that port bypass Authentication and Authorization. 30 | 31 | ## Config 32 | 33 | - `kubectl config view --raw` 34 | - `k config set-context jane --user=jane --cluster=kubernetes` 35 | - `k config set-credentials jane --client-key=jane.key --client-certificate=jane.crt --embed-certs` 36 | - `k config use-context jane` 37 | 38 | You can use a different config file: 39 | 40 | - `k --kubeconfig` or environment variable `KUBECONFIG` 41 | 42 | For example, on a worker `Node`, there is no `kubeconfig` available to use with `kubectl` - but `kubelet` has its own config file that it uses to communicate with the cluster master: 43 | 44 | - `cat /etc/kubernetes/kubelet.conf` 45 | - `k --kubeconfig /etc/kubernetes/kubelet.conf get node` 46 | 47 | ## Auth 48 | 49 | - `k auth can-i delete deployment -A` 50 | 51 | By extracting the Kubernetes server `ca` and the user `cert` and `key` from `kubectl config view --raw`, you can actually make manual API calls against the Kubernetes API: 52 | 53 | - `curl https://10.142.0.2:6443 --cacert ca --cert cert --key key` 54 | 55 | ## Certificates 56 | 57 | Certificates live at `/etc/kubernetes/pki` on the server. You can also find similar information on clients. 
58 | 59 | - `/etc/kubernetes/pki` 60 | - `openssl x509 -in apiserver.crt -text` 61 | 62 | ## Node Restriction Admission Controller 63 | 64 | In `/etc/kubernetes/manifests/kube-apiserver.yaml`: 65 | 66 | ```yaml 67 | - --enable-admission-plugins=NodeRestriction 68 | ``` 69 | 70 | This sets the `NodeRestriction` admission controller to enabled. This means that requests are subject to it. For example, this prevents us from labeling the master `Node` from the worker: 71 | 72 | - `k label node cks-master cks/test=yes` 73 | 74 | This will fail - however, we can label our own `Node`: 75 | 76 | - `k label node cks-worker cks/test=yes` 77 | 78 | There are restricted labels that we cannot set ourselves: 79 | 80 | - `node-restriction.kubernetes.io/test=yes` - you couldn't set this. 81 | 82 | This would prevent a malicious user from changing a label like this to allow `Pods` that look for that label to instead run on a compromised `Node`. 83 | 84 | ## Updates 85 | 86 | It's good to update for support, security fixes, bug fixes, and dependencies. 87 | 88 | `1.19.2` - major/minor/patch 89 | 90 | Minor version every 3 months. No Long Term Support. 91 | 92 | Maintenance release branches for the most recent three minor releases - for now, that's `1.19`, `1.18`, and `1.17`. 93 | 94 | #### Update Process 95 | 96 | First, master components are upgraded - `apiserver`, `controller-manager`, `scheduler`. 97 | 98 | Then, worker components are upgraded - `kubelet`, `kube-proxy`. 99 | 100 | Components should always have the same minor version as the `apiserver`. 101 | 102 | `kubelet` can be two minor versions below `apiserver`, but in general don't do this. 103 | 104 | Stick with same version as `apiserver` or one below for safety. 
105 | 106 | ##### The Process 107 | 108 | - `kubectl cordon` then `kubectl drain` 109 | - Upgrade 110 | - `kubectl uncordon` 111 | 112 | ###### Master 113 | 114 | ```bash 115 | $ k drain cks-master --ignore-daemonsets 116 | $ apt-cache show kubeadm | grep 1.19 117 | $ apt-get install kubeadm=1.19.3-00 kubelet=1.19.3-00 kubectl=1.19.3-00 118 | $ kubeadm upgrade plan 119 | $ kubeadm upgrade apply v1.19.3 120 | $ k uncordon cks-master 121 | ``` 122 | 123 | ###### Worker 124 | 125 | ```bash 126 | $ k drain cks-worker --ignore-daemonsets (from master) 127 | $ apt-cache show kubeadm | grep 1.19 128 | $ apt-get install kubeadm=1.19.3-00 kubelet=1.19.3-00 kubectl=1.19.3-00 129 | $ systemctl restart kubelet 130 | $ k uncordon cks-worker (from master) 131 | ``` 132 | 133 | ###### Application Resiliency 134 | 135 | As always, applications should be able to survive an upgrade: 136 | 137 | - `Pod` termination grace periods. 138 | - `PodDisruptionBudgets` 139 | - Pod Lifecycle Events 140 | 141 | ## Secrets 142 | 143 | Usually passwords, API keys, information needed by an application. 144 | 145 | ## Container Runtime 146 | 147 | `kubelet` args: 148 | 149 | - `--container-runtime` 150 | - `--container-runtime-endpoint` 151 | 152 | `crictl` is an open source adaption that is container and Kubernetes native. 153 | 154 | Kata containers adds an additional virtualization layer - a bit more like traditional VMs. 155 | 156 | gVisor (Google) (`runsc`) implements a limited-use kernel that adds further fine grained separation. Runs in user space so it's separated from the Linux kernel. 157 | 158 | `RuntimeClasses` allow you to specify further runtime environments for objects. 159 | 160 | You use `spec.runtimeClassName` to associate a `Pod` with a given `RuntimeClass`. 
161 | 162 | ## Security Contexts 163 | 164 | ```yaml 165 | spec: 166 | volumes: 167 | - name: vol 168 | emptyDir: {} 169 | securityContext: 170 | runAsUser: 1000 171 | runAsGroup: 3000 172 | fsGroup: 2000 173 | containers: 174 | - name: foo 175 | command: 176 | - sh 177 | - -c 178 | - sleep 1d 179 | image: busybox 180 | resources: {} 181 | securityContext: 182 | runAsUser: 0 183 | ``` 184 | 185 | Notice how here, `securityContext` is set at the `Pod` level for all of its containers, but you can override it for a specific container. 186 | 187 | Check out API reference from docs if you want to know more about what a flag does. 188 | 189 | Forcing running as non-root: 190 | 191 | ```yaml 192 | spec: 193 | volumes: 194 | - name: vol 195 | emptyDir: {} 196 | securityContext: 197 | runAsUser: 1000 198 | runAsGroup: 3000 199 | fsGroup: 2000 200 | containers: 201 | - name: foo 202 | command: 203 | - sh 204 | - -c 205 | - sleep 1d 206 | image: busybox 207 | resources: {} 208 | securityContext: 209 | runAsNonRoot: true 210 | ``` 211 | 212 | This may error if the image runs as root and there is no top-level `securityContext`: `Error: container has runAsNonRoot and image will run as root` 213 | 214 | ## Privileged Containers 215 | 216 | ```yaml 217 | spec: 218 | volumes: 219 | - name: vol 220 | emptyDir: {} 221 | containers: 222 | - name: foo 223 | command: 224 | - sh 225 | - -c 226 | - sleep 1d 227 | image: busybox 228 | resources: {} 229 | securityContext: 230 | privileged: true 231 | ``` 232 | 233 | Setting `privileged: true` will allow the given `Pod` to make OS-level changes, `sysctl` etc., and interact with the kernel. This is bad practice. 234 | 235 | Privileged means the container user `root (0)` is directly mapped to host `root (0)`. 236 | 237 | ## Privilege Escalation 238 | 239 | Privilege Escalation controls whether a process can gain more privileges than its parent process. 
240 | 241 | ```yaml 242 | spec: 243 | volumes: 244 | - name: vol 245 | emptyDir: {} 246 | containers: 247 | - name: foo 248 | command: 249 | - sh 250 | - -c 251 | - sleep 1d 252 | image: busybox 253 | resources: {} 254 | securityContext: 255 | allowPrivilegeEscalation: true 256 | ``` 257 | 258 | `allowPrivilegeEscalation: true` is the default. You can set it to `false` to disable this behavior. 259 | 260 | ## Pod Security Policies 261 | 262 | Cluster-level resources. Controls under which security conditions a `Pod` has to run. 263 | 264 | Pod Security Policy runs as an admission controller. A `Pod` will only be created if it adheres to these rules. It inspects the security contexts. 265 | 266 | Enabling this will deny all `Pods` from being created in the cluster since out of the box none of the `ServiceAccounts` have the necessary permissions to look at `PodSecurityPolicies`. 267 | 268 | As an admin user, we can create `Pods`, but they wouldn't create as a result of a `Deployment`, as an example, since that creates using the `ServiceAccount` we talked about before. 269 | 270 | You'd have to give the `ServiceAccount`, in this case the `default`, the ability to evaluate `PodSecurityPolicies`: 271 | 272 | - `k create role psp-access --verb=use --resource=podsecuritypolicies` 273 | - `k create rolebinding psp-access --role=psp-access --serviceaccount=default:default` 274 | 275 | You'd want to create the proper RBAC and `PodSecurityPolicy` resources before enabling this. 276 | 277 | ## Mutual TLS (mTLS) 278 | 279 | Two-way bilateral authentication. Two parties authenticate to each other to create a secure communication channel. 280 | 281 | By default, all `Pods` can communicate with each other as an implicit function of the chosen CNI. 282 | 283 | Typically, TLS is terminated at an `Ingress` and is unencrypted on the backend in the cluster, between `Pods`, etc. 284 | 285 | You could use a sidecar proxy container to handle this encryption overhead. 
286 | 287 | Something like Istio works with this model - a managed proxy that takes care of certificates. 288 | 289 | You could use an `initContainer` that creates `iptables` rules that would force all traffic from your application's `Pods` through the proxy container. 290 | 291 | All containers in a `Pod` have access to the same network namespace provided you add the capability as given in the example. 292 | 293 | ## Open Policy Agent 294 | 295 | Open source, general purpose policy engine. You've used this before. It works in Kubernetes too. 296 | 297 | Uses rego in Kubernetes the same way. Works with JSON/YAML. It is not natively Kubernetes aware with its resources, for example. 298 | 299 | OPA Gatekeeper creates CRDs to allow Kubernetes to work with OPA. 300 | 301 | ```yaml 302 | apiVersion: templates.gatekeeper.sh/v1beta1 303 | kind: ConstraintTemplate 304 | metadata: 305 | name: k8srequiredlabels 306 | ``` 307 | 308 | ```yaml 309 | apiVersion: constraints.gatekeeper.sh/v1beta1 310 | kind: K8sRequiredLabels 311 | metadata: 312 | name: pod-must-have-foolabel 313 | --- 314 | apiVersion: constraints.gatekeeper.sh/v1beta1 315 | kind: K8sRequiredLabels 316 | metadata: 317 | name: pod-must-have-barlabel 318 | ``` 319 | 320 | It creates admission webhooks - there are two kinds - `validating` and `mutating`. Mutating webhooks are invoked first and can modify objects. After this is done, Validating webhooks are invoked and can reject requests to enforce custom policies. 321 | 322 | Use Rego playground to test OPA policies. 323 | 324 | ## Image Footprints 325 | 326 | Remember that containers use syscalls against the host OS kernel. It is possible to run containers using the same PID so they share processes. 327 | 328 | Docker images are built using layers. Only the instructions `RUN`, `COPY`, and `ADD` create layers. Other instructions create temporary intermediate images, and do not increase the size of the build. 
329 | 330 | An image size factors in the base image size plus additional layers, etc. 331 | 332 | ```Dockerfile 333 | FROM ubuntu 334 | ARG DEBIAN_FRONTEND=noninteractive 335 | RUN apt-get update && apt-get install -y golang-go 336 | COPY app.go . 337 | RUN CGO_ENABLED=0 go build app.go 338 | 339 | FROM alpine 340 | RUN chmod a-w /etc 341 | RUN addgroup -S appgroup && adduser -S appuser -G appgroup -h /home/appuser 342 | RUN rm -rf /bin/* 343 | COPY --from=0 /app /home/appuser 344 | USER appuser 345 | CMD ["./app"] 346 | ``` 347 | 348 | Using this logic, the `--from=0` here step copies from the first stage, stage 0, and stage 1 at the bottom copies the file into the local directory in the resultant final image. 349 | 350 | This essentially gives you a resulting image that only encompasses the final stage, reducing its size. 351 | 352 | ### Hardening An Image 353 | 354 | - Using specific versions in a Dockerfile is more secure instead of using something like `latest`. 355 | - Don't run as root in a container. In the example above, we establish a dedicated user and then run as that user by calling `USER`. 356 | - Making the filesystem read only is also more secure. This avoids allowing write access to a given filesystem running as part of a container. Using the line `RUN chmod a-w /etc`, we remove write permissions for the `/etc` directory for all users. 357 | - Removing shell access is also an optimization. We do this in the Dockerfile by running `RUN rm -rf /bin/*` - this essentially removes the ability to run anything located in that directory. This explains why sometimes you cannot exec into a container (think top level Kubernetes `Pods`) because that is missing. 358 | - In general, running commands together in a layer, like `apt-get update && apt-get install` all in the same operation is better and decreases size. Also cleanup install packages in the build. 
359 | 360 | ## Static Analysis 361 | 362 | Looks at the source code and text files and parses them to check against rules. Those rules can then be enforced. 363 | 364 | Examples: 365 | 366 | - Always define resource requests and limits. 367 | - `Pods` should never use the default `ServiceAccount`. 368 | - Don't store sensitive data in plain text in Dockerfiles or Kubernetes resources. 369 | 370 | Twistlock and Sysdig are examples of tools that do this. 371 | 372 | You could do this in an image build phase or after the build phase in a test phase. We do this by leveraging the Sysdig inline scanner. 373 | 374 | `PodSecurityPolicies` and OPA can be used within the cluster for static analysis. 375 | 376 | For example, pulling info from a `Secret` as an environment variable is more secure than hardcoding things. Look out for obvious stuff. 377 | 378 | Kubesec is risk analysis for Kubernetes resources. It's open source and opinionated. It runs using a fixed set of rules based on security best practices. It can run as a binary, a Docker container, a `kubectl` plugin, or an admission controller. Remember that tools like Sysdig also offer an admission controller. You can also paste a manifest into Kubesec to evaluate it on demand. 379 | 380 | Remember these important features for `securityContext`: 381 | - `readOnlyRootFilesystem = true` 382 | - `runAsNonRoot = true` 383 | - `runAsUser -gt 10000` 384 | - `capabilities .drop` 385 | 386 | OPA offers a tool called `conftest` that uses the same OPA rego language. You can run this in Docker as well. 387 | 388 | You can also use `conftest` against Dockerfiles. 389 | 390 | ## Image Vulnerability Scanning 391 | 392 | Containers that contain exploitable packages are a problem. This could result in privilege escalation, data leaks, DDoS, etc. 393 | 394 | https://cve.mitre.org/ 395 | https://nvd.nist.gov/ 396 | 397 | Tools use these databases to scan images for vulnerabilities. We use the Sysdig inline scanner for this. 
You could stop a build, or use an Admission Controller to not allow a compromised image version to run in a cluster. 398 | 399 | You could also restrict based on registry hostnames within the cluster using something like OPA or `PodSecurityPolicies`. This would happen either in `MutatingAdmission` or `Validating` webhook stages. 400 | 401 | This type of scanning could also take place in a container registry like GCR or ECR. 402 | 403 | ### Clair 404 | 405 | Open source vulnerability assessment tool. CNCF supported. Uses vulnerability databases. 406 | 407 | ### Trivy 408 | 409 | Also open source. One command to run it. 410 | 411 | `$ docker run ghcr.io/aquasecurity/trivy:latest image nginx` 412 | 413 | This will cross reference failures with CVE numbers. 414 | 415 | ## Supply Chain Security 416 | 417 | Using a private registry is an example of a secure supply chain component. 418 | 419 | You can create a `docker-registry` `Secret` in Kubernetes and then associate the `imagePullSecrets` to the `ServiceAccounts`, for example. 420 | 421 | You can run an image using an image digest instead of a tag since tags can theoretically change and point to different digests. 422 | 423 | You could use OPA via the Admission Controller to limit images to specific repositories. 424 | 425 | Remember that you create a kind of `ConstraintTemplate` that uses `spec.crd.spec.names[0].kind=K8sTrustedImages` then create a `K8sTrustedImages` object. The admission webhook would then deny creation of a `Pod` if its spec fails the checks specified by these templates. 426 | 427 | `ImagePolicyWebhook` creates a kind of `ImageReview` which can be assessed by an external tool as part of an admission workflow. 428 | 429 | You would add this to `--enable-admission-plugins` as `ImagePolicyWebhook` to enable the Admission Controller. 430 | 431 | - `--admission-control-config-file=path-to-admission-config.yaml` 432 | 433 | You have to adjust Volumes to mount this data in the `kube-apiserver` `Pod`. 
You must have certificates, a `kubeconfig`, etc. 434 | 435 | `AdmissionConfiguration` kind is where this logic is configured. Remember that `defaultAllow` is set to `false` by default which means no `Pods` will create out of the box if this configuration is incomplete. 436 | 437 | ## Behavioral Analytics 438 | 439 | Syscall interface is provided by the kernel, for example `getpid()` or `reboot()`. 440 | 441 | Applications run in the user space. Applications can communicate with the syscall interface or they can use libraries. The request is then passed to the kernel and the hardware. 442 | 443 | `seccomp` and AppArmor lie between the user space and syscall interface for added protection. 444 | 445 | Processes in a container are able to communicate with the kernel given how they run in shared spaces. They're namespaced via container logic, but can still talk to the kernel. 446 | 447 | `strace` intercepts and logs system calls made by a process. It can also log and display signals received by a process so it's good for debugging etc. 448 | 449 | - `$ strace ls /` 450 | 451 | This would provide a list of syscalls made to the kernel. 452 | 453 | In the end, all commands run on the command line result in syscalls for how they operate. 454 | 455 | - `/proc` directory contains information and connections to processes and kernel. 456 | - Study it to learn how processes work. 457 | - Configuration and administrative tasks. 458 | - Contains files that don't technically exist. 459 | 460 | You can do `docker ps | grep etcd` and then `ps aux | grep etcd` to find the running `etcd` process. 461 | 462 | - `$ strace -p 3502 -f` - this would show you syscalls made by a process, in this case `etcd`. `-f` follows it. 463 | 464 | You can go to `cd /proc/3502` and `ls` and see open files related to a process - `etcd` in this case. 465 | 466 | Writes to `etcd` for things like `Secrets` will show up in `/proc/3502/fd/7 (symlink)` as an example. 
467 | 468 | - `$ pstree -p` shows process tree of running processes. You could find `containerd` in here and see a list of running containers. 469 | 470 | You could use this to find the `pid` of a running container, navigate to its `/proc/pid` folder and `cat environ` and see environment variable data. 471 | 472 | ### Falco 473 | 474 | CNCF native runtime security. Deep kernel tracing built on the Linux kernel. 475 | 476 | Describe security rules against a system, detect unwanted behavior. 477 | 478 | Automated response to security violations. 479 | 480 | Kubernetes docs has a page specific to Falco with instructions for installation. 481 | 482 | `/etc/falco` where configuration files are stored. 483 | 484 | - `$ tail -f /var/log/syslog | grep falco` 485 | 486 | You would see log output from `falco` activity that shows running processes, package management launched, shell executions, etc. 487 | 488 | Know how to find those rules and review them. 489 | 490 | ## Immutability of Containers at Runtime 491 | 492 | Immutability simply means the container won't be modified during its lifetime. 493 | 494 | ### Mutable 495 | 496 | - `ssh` to a container, stop application, update application, start application 497 | 498 | ### Immutable 499 | 500 | - Create new container image, delete container instance, create new container instance. 501 | 502 | With immutable containers, we always know the state. With mutable instances, we know less. 503 | 504 | Immutability allows us to use advanced deployment methods native to Kubernetes, easy rollbacks, more reliability, and better security. 505 | 506 | ### Making Containers Immutable 507 | 508 | - Remove bash/shell, make filesystem read only, run as user and non-root 509 | - You could remove write privileges to all non-essential directories in line using `command`. 510 | - `startupProbe` - runs prior to liveness/readiness checks. You could use this to enforce changes too. 
511 | - Use `SecurityContexts` and `PodSecurityPolicies` to enforce read only fs, etc. 512 | - You could use an init container to handle read/write permissions and then harden the app container. 513 | 514 | ### StartupProbe 515 | 516 | ```yaml 517 | apiVersion: v1 518 | kind: Pod 519 | metadata: 520 | creationTimestamp: null 521 | labels: 522 | run: immutable 523 | name: immutable 524 | spec: 525 | containers: 526 | - image: httpd 527 | name: immutable 528 | resources: {} 529 | startupProbe: 530 | exec: 531 | command: 532 | - rm 533 | - /bin/touch 534 | initialDelaySeconds: 1 535 | periodSeconds: 5 536 | dnsPolicy: ClusterFirst 537 | restartPolicy: Always 538 | status: {} 539 | ``` 540 | 541 | ### SecurityContext 542 | 543 | ```yaml 544 | apiVersion: v1 545 | kind: Pod 546 | metadata: 547 | creationTimestamp: null 548 | labels: 549 | run: immutable 550 | name: immutable 551 | spec: 552 | containers: 553 | - image: httpd 554 | name: immutable 555 | resources: {} 556 | securityContext: 557 | readOnlyRootFilesystem: true 558 | volumeMounts: 559 | - mountPath: /usr/local/apache2/logs 560 | name: cache-volume 561 | volumes: 562 | - name: cache-volume 563 | emptyDir: {} 564 | dnsPolicy: ClusterFirst 565 | restartPolicy: Always 566 | status: {} 567 | ``` 568 | 569 | In this example, we use `readOnlyRootFilesystem` but `apache` will complain if it can't write to the directory it needs for logs. We get around this by using `emptyDir` to allow it to write to that temporary directory. 570 | 571 | ## Kubernetes Auditing 572 | 573 | Kubernetes is all based on API requests. Auditing allows us to hold those requests so that we can review them for security purposes. 574 | 575 | - Did someone access an important `Secret` while it was not protected? We can check who accessed it. 576 | - When was the last time that user X did access cluster Y? 577 | - Does my CRD work properly? 
578 | 579 | Requests to the Kubernetes API run through stages: 580 | 581 | - `RequestReceived` - The stage for events generated as soon as the audit handler receives the request, and before it is delegated down the handler chain. 582 | - `ResponseStarted` - Once the response headers are sent, but before the response body is sent. This stage is only generated for long-running requests (e.g. `watch`). 583 | - `ResponseComplete` - The response body has been completed and no more bytes will be sent. 584 | - `Panic` - Events generated when a panic occurred. 585 | 586 | Using these stages, we can customize exactly what we want to log. 587 | 588 | Audit policy rule levels: 589 | 590 | - `None` - don't log events that match this rule. 591 | - `Metadata` - log request metadata (requesting user, timestamp, resource, verb, etc.) but not request or response body. 592 | - `Request` - log event metadata and request body but not response body. This does not apply for non-resource requests. 593 | - `RequestResponse` - log event metadata, request, and response bodies. This does not apply for non-resource requests. 594 | 595 | ```yaml 596 | apiVersion: audit.k8s.io/v1 597 | kind: Policy 598 | omitStages: 599 | - "RequestReceived" 600 | rules: 601 | # log no "read" actions 602 | - level: None 603 | verbs: ["get", "watch", "list"] 604 | 605 | # log nothing regarding events 606 | - level: None 607 | resources: 608 | - group: "" # core 609 | resources: ["events"] 610 | 611 | # log nothing coming from some groups 612 | - level: None 613 | userGroups: ["system:nodes"] 614 | - level: RequestResponse 615 | resources: 616 | - group: "" 617 | resources: ["secrets"] 618 | 619 | # for everything else log 620 | - level: Metadata 621 | ``` 622 | 623 | ### Configuring 624 | 625 | - `$ mkdir /etc/kubernetes/audit` 626 | - Create `policy.yaml` in the `audit` folder. 
627 | - Edit `kube-apiserver.yaml` with the following flags: 628 | - `--audit-policy-file=/etc/kubernetes/audit/policy.yaml` 629 | - `--audit-log-path=/etc/kubernetes/audit/logs/audit.log` 630 | - `--audit-log-maxsize=500` 631 | - `--audit-log-maxbackup=5` 632 | - Edit `kube-apiserver.yaml` to mount the location in the `apiserver` `Pods` to store the logs: 633 | 634 | ```yaml 635 | volumeMounts: 636 | - mountPath: /etc/kubernetes/audit 637 | name: audit 638 | volumes: 639 | - hostPath: 640 | path: /etc/kubernetes/audit 641 | type: DirectoryOrCreate 642 | name: audit 643 | ``` 644 | 645 | Use the docs for auditing to help with this part. 646 | 647 | Backends for storing audit data can be JSON logs, external APIs (webhooks), dynamic backend (ElasticSearch, FileBeat, Fluentd). 648 | 649 | This file is largely used for parsing on your own. It should really be exported somewhere useful and readable. But you can do something like create a `Secret` and `grep` for it in the log file. 650 | 651 | ### Advanced Audit Policy 652 | 653 | ```yaml 654 | apiVersion: audit.k8s.io/v1 655 | kind: Policy 656 | omitStages: 657 | - 'RequestReceived' 658 | rules: 659 | # log no "read" actions 660 | - level: None 661 | verbs: ['get', 'watch', 'list'] 662 | 663 | # log only metadata from Secrets 664 | - level: Metadata 665 | resources: 666 | - group: '' 667 | resources: ['secrets'] 668 | 669 | # for everything else log 670 | - level: RequestResponse 671 | ``` 672 | 673 | If you create a policy and the `kube-apiserver` `Pod` doesn't come back up, remember the trick using `/var/log/pods` to see what's going on. You'll likely see more than one since the new one didn't start, so use the latest and read its logs. 
674 | 675 | ### Looking at API Access History for Secrets 676 | 677 | ```yaml 678 | apiVersion: audit.k8s.io/v1 679 | kind: Policy 680 | omitStages: 681 | - 'RequestReceived' 682 | rules: 683 | # log no "read" actions 684 | - level: None 685 | verbs: ['get', 'watch', 'list'] 686 | 687 | # log metadata plus request and response bodies for Secrets 688 | - level: RequestResponse 689 | resources: 690 | - group: '' 691 | resources: ['secrets'] 692 | 693 | # for everything else 694 | - level: RequestResponse 695 | ``` 696 | 697 | Commenting out the line for the file path for the policy is enough to cause the `apiserver` to restart. You'll have to uncomment it to get it to restart again and use your new policy. 698 | 699 | You could use this to see API calls for things - including patching objects, etc. 700 | 701 | ## Kernel Hardening Tools 702 | 703 | Processes are restricted in namespaces which restrict what they can see - users, filesystems, other processes. `cgroups` restrict the resource usage of processes. 704 | 705 | Between the syscall interface and top level spaces like user and app space, we can use AppArmor and `seccomp` to harden the kernel. 706 | 707 | ### AppArmor 708 | 709 | You create profiles in AppArmor to define what a process can and cannot do. You could do this for something like `kubelet` or any other app. 710 | 711 | Profiles have the concept of modes: 712 | 713 | - `Unconfined` - process can escape, nothing is enforced. 714 | - `Complain` - processes can escape but will be logged. 715 | - `Enforce` - processes cannot escape, cannot do more than we allow them to do in their profile. 
716 | 717 | Commands: 718 | 719 | - `aa-status` - show profiles 720 | - `aa-genprof` - create new profile (smart wrapper around `aa-logprof`) 721 | - `aa-complain` - put profile in complain mode 722 | - `aa-enforce` - put profile in enforce mode 723 | - `aa-logprof` - update the profile if app produced some more usage logs (syslog) 724 | 725 | You could do this for `curl`: `$ aa-genprof curl` 726 | 727 | Profiles are located in `/etc/apparmor.d`. 728 | 729 | You could then run `aa-logprof` to review recommended changes and then save them so your processes are more secure. 730 | 731 | #### With Docker 732 | 733 | `/etc/apparmor.d` is where policies go. 734 | 735 | For example `/etc/apparmor.d/docker-nginx` 736 | 737 | `$ apparmor_parser path-to-file` adds it. 738 | 739 | `$ docker run --security-opt apparmor=docker-nginx` 740 | 741 | #### With Kubernetes 742 | 743 | Container runtime needs to support AppArmor. Docker does by default. 744 | 745 | AppArmor must be installed on every node, and the profiles need to be available on every node. 746 | 747 | AppArmor profiles are specified **per container**, not per `Pod`. 748 | 749 | ```yaml 750 | apiVersion: v1 751 | kind: Pod 752 | metadata: 753 | annotations: 754 | container.apparmor.security.beta.kubernetes.io/secure: localhost/docker-nginx # you reference the name of the container after / and the profile as the value 755 | labels: 756 | run: secure 757 | name: secure 758 | spec: 759 | containers: 760 | - image: nginx 761 | name: secure 762 | resources: {} 763 | dnsPolicy: ClusterFirst 764 | restartPolicy: Always 765 | status: {} 766 | ``` 767 | 768 | The profile must exist in order for the `Pod` to create, otherwise its creation will be `Blocked`. 769 | 770 | Annotations for AppArmor can be found in Kubernetes docs. 771 | 772 | ### seccomp 773 | 774 | "Security computing mode." 775 | 776 | Security facility in the Linux kernel. 777 | 778 | Restricts execution of syscalls made by processes. 
779 | 780 | By default, applications can make syscalls as needed. Using `seccomp`, we can restrict syscalls. 781 | 782 | It only allows `exit()`, `sigreturn()`, `read()`, and `write()` by default. 783 | 784 | Nowadays `seccomp` is combined with BPF to make `seccomp-bpf`. 785 | 786 | #### With Docker 787 | 788 | `$ docker run --security-opt seccomp=default.json nginx` 789 | 790 | #### With Kubernetes 791 | 792 | In order for kubelet to use `seccomp` profiles, you can consult the Kubernetes docs and find the argument for specifying the directory, which is `/var/lib/kubelet/seccomp` by default. 793 | 794 | You can create the directory and then add your profile to it. 795 | 796 | ```yaml 797 | apiVersion: v1 798 | kind: Pod 799 | metadata: 800 | #annotations: 801 | # seccomp.security.alpha.kubernetes.io/secure: localhost/profiles/default.json # pre 1.19 you could use this annotation 802 | labels: 803 | run: secure 804 | name: secure 805 | spec: 806 | # in 1.19+, you enable seccomp using securityContext 807 | securityContext: 808 | seccompProfile: 809 | type: Localhost 810 | localhostProfile: default.json 811 | containers: 812 | - image: nginx 813 | name: secure 814 | resources: {} 815 | dnsPolicy: ClusterFirst 816 | restartPolicy: Always 817 | status: {} 818 | ``` 819 | 820 | Like AppArmor, if the profile doesn't exist, the `Pod` will fail to start, with status `CreateContainerError`. Running `describe` will show this error. 821 | 822 | ## Reduce Attack Surface of Host 823 | 824 | Attack surface consists of anything exposed that can be sourced by malicious code. Networking, applications, IAM, etc. 825 | 826 | Applications and kernel should always be up to date. Additional packages should not be present if not needed. 827 | 828 | Use a firewall to eliminate access to a port. 829 | 830 | Run applications as a specific user instead of root to avoid privilege escalation. 831 | 832 | `Nodes` should be ephemeral. They should be created from images and immutable. 
They should be recycled efficiently and without downtime. 833 | 834 | A lot of included services with base level OS images, like `ubuntu`, are unnecessary and increase attack surface. 835 | 836 | - `$ netstat -plnt` 837 | - `$ lsof -i :22` 838 | - `$ systemctl status kubelet` 839 | - `$ ps aux` 840 | 841 | For example, to disable a service: 842 | 843 | - `$ systemctl list-units --type=service --state=running | grep snapd` 844 | - `$ systemctl stop snapd` 845 | - `$ systemctl disable snapd` 846 | 847 | Show users: 848 | 849 | - `$ cat /etc/passwd` 850 | 851 | Add users: 852 | 853 | - `$ adduser foo` 854 | 855 | Delete users: 856 | 857 | - `$ deluser foo` 858 | 859 | Show bash processes: 860 | 861 | - `$ ps aux | grep bash` 862 | 863 | ## Handy Stuff 864 | 865 | - `/var/log/pods` - You can see `Pod` logs here if you're unable to use the API for some reason. 866 | --------------------------------------------------------------------------------