├── .github └── workflows │ └── auto-approve.yml ├── README.md ├── agents.yaml ├── compile-agents-yaml.sh ├── heapster.yaml ├── hooks └── pre-commit ├── logging-agent.yaml ├── metadata-agent.yaml └── rbac-setup.yaml /.github/workflows/auto-approve.yml: -------------------------------------------------------------------------------- 1 | # Automatically approve PRs created by our release robot account. 2 | name: Auto approve 3 | on: pull_request 4 | 5 | jobs: 6 | auto-approve: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: hmarr/auto-approve-action@v2.0.0 10 | if: github.actor == 'stackdriver-instrumentation-release' 11 | with: 12 | github-token: "${{ secrets.GITHUB_TOKEN }}" 13 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Stackdriver Kubernetes Configs 2 | 3 | A collection of Kubernetes configurations to integrate with Stackdriver 4 | products. For now, it supports [Stackdriver Logging](https://cloud.google.com/logging/) 5 | and [Stackdriver Monitoring](https://cloud.google.com/monitoring/). This repo is only used 6 | for [manual installation](https://cloud.google.com/monitoring/kubernetes-engine/customizing) 7 | on existing clusters. 8 | 9 | ## Setting up git hooks 10 | 11 | From the root directory of this repo, please run the following command: 12 | 13 | ``` 14 | ln -s "$(realpath hooks/pre-commit)" "$(git rev-parse --git-dir)/hooks/pre-commit" 15 | ``` 16 | 17 | This will ensure that all commits run the 18 | [`compile-agents-yaml.sh`](#compile-agents-yaml) command. 19 | 20 | ## Compile Agents YAML 21 | 22 | From the root directory of this repo, you can run the following command: 23 | 24 | ``` 25 | ./compile-agents-yaml.sh 26 | ``` 27 | 28 | This re-generates the `agents.yaml` file so that it stays 29 | up-to-date with changes to the other YAML files. 
30 | 31 | -------------------------------------------------------------------------------- /agents.yaml: -------------------------------------------------------------------------------- 1 | # THIS FILE IS AUTO-GENERATED DO NOT EDIT 2 | apiVersion: apps/v1 3 | kind: Deployment 4 | metadata: 5 | labels: 6 | k8s-app: stackdriver-heapster 7 | version: v1.6.1 8 | name: heapster 9 | namespace: stackdriver-agents 10 | spec: 11 | replicas: 1 12 | selector: 13 | matchLabels: 14 | k8s-app: stackdriver-heapster 15 | strategy: 16 | rollingUpdate: 17 | maxSurge: 1 18 | maxUnavailable: 1 19 | type: RollingUpdate 20 | template: 21 | metadata: 22 | creationTimestamp: null 23 | labels: 24 | k8s-app: stackdriver-heapster 25 | version: v1.6.1 26 | spec: 27 | containers: 28 | - env: 29 | - name: CLUSTER_NAME 30 | valueFrom: 31 | configMapKeyRef: 32 | name: cluster-config 33 | key: cluster_name 34 | - name: CLUSTER_LOCATION 35 | valueFrom: 36 | configMapKeyRef: 37 | name: cluster-config 38 | key: cluster_location 39 | - name: GOOGLE_APPLICATION_CREDENTIALS 40 | valueFrom: 41 | configMapKeyRef: 42 | name: google-cloud-config 43 | key: credentials_path 44 | command: 45 | - /heapster 46 | - --source=kubernetes.summary_api:https://kubernetes.default?kubeletHttps=true&kubeletPort=10250&insecure=true 47 | - --sink=stackdriver:?cluster_name=$(CLUSTER_NAME)&cluster_location=$(CLUSTER_LOCATION)&zone=$(CLUSTER_LOCATION)&use_old_resources=false&use_new_resources=true&min_interval_sec=100&batch_export_timeout_sec=110 48 | image: gcr.io/stackdriver-agents/heapster-amd64:v1.6.1 49 | imagePullPolicy: Always 50 | livenessProbe: 51 | failureThreshold: 3 52 | httpGet: 53 | path: /healthz 54 | port: 8082 55 | scheme: HTTP 56 | initialDelaySeconds: 180 57 | periodSeconds: 10 58 | successThreshold: 1 59 | timeoutSeconds: 5 60 | name: heapster 61 | resources: 62 | limits: 63 | cpu: 88m 64 | memory: 204Mi 65 | requests: 66 | cpu: 88m 67 | memory: 204Mi 68 | terminationMessagePath: /dev/termination-log 69 | 
terminationMessagePolicy: File 70 | volumeMounts: 71 | - mountPath: /etc/google-cloud/ 72 | name: google-cloud-config 73 | - command: 74 | - /pod_nanny 75 | - --cpu=80m 76 | - --extra-cpu=0.5m 77 | - --memory=140Mi 78 | - --extra-memory=4Mi 79 | - --threshold=5 80 | - --deployment=heapster 81 | - --container=heapster 82 | - --poll-period=300000 83 | - --estimator=exponential 84 | env: 85 | - name: MY_POD_NAME 86 | valueFrom: 87 | fieldRef: 88 | apiVersion: v1 89 | fieldPath: metadata.name 90 | - name: MY_POD_NAMESPACE 91 | valueFrom: 92 | fieldRef: 93 | apiVersion: v1 94 | fieldPath: metadata.namespace 95 | image: gcr.io/google_containers/addon-resizer:1.7 96 | imagePullPolicy: IfNotPresent 97 | name: heapster-nanny 98 | resources: 99 | limits: 100 | cpu: 50m 101 | memory: 112360Ki 102 | requests: 103 | cpu: 50m 104 | memory: 112360Ki 105 | terminationMessagePath: /dev/termination-log 106 | terminationMessagePolicy: File 107 | dnsPolicy: ClusterFirst 108 | restartPolicy: Always 109 | schedulerName: default-scheduler 110 | securityContext: {} 111 | serviceAccount: heapster 112 | serviceAccountName: heapster 113 | terminationGracePeriodSeconds: 30 114 | volumes: 115 | - configMap: 116 | defaultMode: 420 117 | name: google-cloud-config 118 | name: google-cloud-config 119 | 120 | --- 121 | apiVersion: apps/v1 122 | kind: DaemonSet 123 | metadata: 124 | labels: 125 | app: stackdriver-logging-agent 126 | name: stackdriver-logging-agent 127 | namespace: stackdriver-agents 128 | spec: 129 | selector: 130 | matchLabels: 131 | app: stackdriver-logging-agent 132 | template: 133 | metadata: 134 | labels: 135 | app: stackdriver-logging-agent 136 | spec: 137 | containers: 138 | - env: 139 | - name: NODE_NAME 140 | valueFrom: 141 | fieldRef: 142 | apiVersion: v1 143 | fieldPath: spec.nodeName 144 | - name: K8S_NODE_NAME 145 | valueFrom: 146 | fieldRef: 147 | apiVersion: v1 148 | fieldPath: spec.nodeName 149 | - name: GOOGLE_APPLICATION_CREDENTIALS 150 | valueFrom: 151 | 
configMapKeyRef: 152 | name: google-cloud-config 153 | key: credentials_path 154 | - name: CLUSTER_NAME 155 | valueFrom: 156 | configMapKeyRef: 157 | name: cluster-config 158 | key: cluster_name 159 | - name: CLUSTER_LOCATION 160 | valueFrom: 161 | configMapKeyRef: 162 | name: cluster-config 163 | key: cluster_location 164 | image: gcr.io/stackdriver-agents/stackdriver-logging-agent:1.10.3 165 | imagePullPolicy: IfNotPresent 166 | livenessProbe: 167 | exec: 168 | command: 169 | - /bin/sh 170 | - -c 171 | - | 172 | LIVENESS_THRESHOLD_SECONDS=${LIVENESS_THRESHOLD_SECONDS:-300}; STUCK_THRESHOLD_SECONDS=${STUCK_THRESHOLD_SECONDS:-900}; if [ ! -e /var/run/google-fluentd/buffers ]; then 173 | exit 1; 174 | fi; touch -d "${STUCK_THRESHOLD_SECONDS} seconds ago" /tmp/marker-stuck; if [[ -z "$(find /var/run/google-fluentd/buffers -type f -newer /tmp/marker-stuck -print -quit)" ]]; then 175 | rm -rf /var/run/google-fluentd/buffers; 176 | exit 1; 177 | fi; touch -d "${LIVENESS_THRESHOLD_SECONDS} seconds ago" /tmp/marker-liveness; if [[ -z "$(find /var/run/google-fluentd/buffers -type f -newer /tmp/marker-liveness -print -quit)" ]]; then 178 | exit 1; 179 | fi; 180 | failureThreshold: 3 181 | initialDelaySeconds: 600 182 | periodSeconds: 60 183 | successThreshold: 1 184 | timeoutSeconds: 1 185 | name: logging-agent 186 | resources: 187 | limits: 188 | cpu: "1" 189 | memory: 300Mi 190 | requests: 191 | cpu: 100m 192 | memory: 200Mi 193 | terminationMessagePath: /dev/termination-log 194 | terminationMessagePolicy: File 195 | volumeMounts: 196 | - mountPath: /var/run 197 | name: varrun 198 | - mountPath: /var/log 199 | name: varlog 200 | - mountPath: /var/lib/docker/containers 201 | name: varlibdockercontainers 202 | readOnly: true 203 | - mountPath: /etc/google-fluentd/google-fluentd.conf 204 | subPath: google-fluentd.conf 205 | name: output-config-volume 206 | - mountPath: /etc/google-fluentd/config.d 207 | name: input-config-volume 208 | - mountPath: /etc/google-cloud/ 209 | 
name: google-cloud-config 210 | serviceAccount: logging-agent 211 | serviceAccountName: logging-agent 212 | dnsPolicy: ClusterFirst 213 | restartPolicy: Always 214 | schedulerName: default-scheduler 215 | securityContext: {} 216 | tolerations: 217 | - operator: "Exists" 218 | effect: "NoExecute" 219 | - operator: "Exists" 220 | effect: "NoSchedule" 221 | volumes: 222 | - hostPath: 223 | path: /var/run 224 | type: "" 225 | name: varrun 226 | - hostPath: 227 | path: /var/log 228 | type: "" 229 | name: varlog 230 | - hostPath: 231 | path: /var/lib/docker/containers 232 | type: "" 233 | name: varlibdockercontainers 234 | - configMap: 235 | defaultMode: 420 236 | name: logging-agent-output-config 237 | name: output-config-volume 238 | - configMap: 239 | defaultMode: 420 240 | name: logging-agent-input-config 241 | name: input-config-volume 242 | - configMap: 243 | defaultMode: 420 244 | name: google-cloud-config 245 | name: google-cloud-config 246 | updateStrategy: 247 | rollingUpdate: 248 | maxUnavailable: 1 249 | type: RollingUpdate 250 | --- 251 | # Config map for Logging Agent input and corresponding filter plugins. 252 | apiVersion: v1 253 | kind: ConfigMap 254 | metadata: 255 | name: logging-agent-input-config 256 | namespace: stackdriver-agents 257 | data: 258 | 1.containers.input.conf: |- 259 | # This configuration file for Fluentd is used 260 | # to watch changes to Docker log files that live in the 261 | # directory /var/lib/docker/containers/ and are symbolically 262 | # linked to from the /var/log/containers directory using names that capture the 263 | # pod name and container name. These logs are then submitted to 264 | # Google Cloud Logging which assumes the installation of the cloud-logging plug-in. 
265 | # 266 | # Example 267 | # ======= 268 | # A line in the Docker log file might look like this JSON: 269 | # 270 | # {"log":"2014/09/25 21:15:03 Got request with path wombat\\n", 271 | # "stream":"stderr", 272 | # "time":"2014-09-25T21:15:03.499185026Z"} 273 | # 274 | # The original tag is derived from the log file's location. 275 | # For example a Docker container's logs might be in the directory: 276 | # /var/lib/docker/containers/997599971ee6366d4a5920d25b79286ad45ff37a74494f262e3bc98d909d0a7b 277 | # and in the file: 278 | # 997599971ee6366d4a5920d25b79286ad45ff37a74494f262e3bc98d909d0a7b-json.log 279 | # where 997599971ee6... is the Docker ID of the running container. 280 | # The Kubernetes kubelet makes a symbolic link to this file on the host 281 | # machine in the /var/log/containers directory which includes the pod name, 282 | # the namespace name and the Kubernetes container name: 283 | # synthetic-logger-0.25lps-pod_default_synth-lgr-997599971ee6366d4a5920d25b79286ad45ff37a74494f262e3bc98d909d0a7b.log 284 | # -> 285 | # /var/lib/docker/containers/997599971ee6366d4a5920d25b79286ad45ff37a74494f262e3bc98d909d0a7b/997599971ee6366d4a5920d25b79286ad45ff37a74494f262e3bc98d909d0a7b-json.log 286 | # The /var/log directory on the host is mapped to the /var/log directory in the container 287 | # running this instance of Fluentd and we end up collecting the file: 288 | # /var/log/containers/synthetic-logger-0.25lps-pod_default_synth-lgr-997599971ee6366d4a5920d25b79286ad45ff37a74494f262e3bc98d909d0a7b.log 289 | # This results in the tag: 290 | # var.log.containers.synthetic-logger-0.25lps-pod_default_synth-lgr-997599971ee6366d4a5920d25b79286ad45ff37a74494f262e3bc98d909d0a7b.log 291 | # where 'synthetic-logger-0.25lps-pod' is the pod name, 'default' is the 292 | # namespace name, 'synth-lgr' is the container name and '997599971ee6..' is 293 | # the container ID. 
294 | # The record reformer is used to extract pod_name, namespace_name and 295 | # container_name from the tag and set them in a local_resource_id in the 296 | # format of: 297 | # 'k8s_container...'. 298 | # The reformer also changes the tags to 'stderr' or 'stdout' based on the 299 | # value of 'stream'. 300 | # local_resource_id is later used by google_cloud plugin to determine the 301 | # monitored resource to ingest logs against. 302 | 303 | # Json Log Example: 304 | # {"log":"[info:2016-02-16T16:04:05.930-08:00] Some log text here\n","stream":"stdout","time":"2016-02-17T00:04:05.931087621Z"} 305 | # CRI Log Example: 306 | # 2016-02-17T00:04:05.931087621Z stdout F [info:2016-02-16T16:04:05.930-08:00] Some log text here 307 | 308 | @type tail 309 | path /var/log/containers/*.log 310 | pos_file /var/run/google-fluentd/pos-files/gcp-containers.pos 311 | # Tags at this point are in the format of: 312 | # reform.var.log.containers.__-.log 313 | tag reform.* 314 | read_from_head true 315 | 316 | @type multi_format 317 | 318 | format json 319 | time_key time 320 | time_format %Y-%m-%dT%H:%M:%S.%NZ 321 | 322 | 323 | format /^(? 326 | 327 | 328 | 329 | 330 | @type parser 331 | format /^(?\w)(? 337 | 338 | 339 | # This plugin uses environment variables KUBERNETES_SERVICE_HOST and 340 | # KUBERNETES_SERVICE_PORT to talk to the API server. These environment 341 | # variables are added by kubelet automatically. 342 | @type kubernetes_metadata 343 | # Interval in seconds to dump cache stats locally in the Fluentd log. 344 | stats_interval 300 345 | # TTL in seconds of each cached element. 346 | cache_ttl 30 347 | # Skip fetching unused metadata. 348 | skip_container_metadata true 349 | skip_master_url true 350 | skip_namespace_metadata true 351 | 352 | 353 | 354 | # We have to use record_modifier because only this plugin supports complex 355 | # logic to modify record the way we need. 
356 | @type record_modifier 357 | enable_ruby true 358 | 359 | # Extract "kubernetes"->"labels" and set them as 360 | # "logging.googleapis.com/labels". Prefix these labels with 361 | # "k8s-pod" to distinguish with other labels and avoid 362 | # label name collision with other types of labels. 363 | _dummy_ ${if record.is_a?(Hash) && record.has_key?('kubernetes') && record['kubernetes'].has_key?('labels') && record['kubernetes']['labels'].is_a?(Hash); then; record["logging.googleapis.com/labels"] = record['kubernetes']['labels'].map{ |k, v| ["k8s-pod/#{k}", v]}.to_h; end; nil} 364 | 365 | # Delete this dummy field and the rest of "kubernetes" and "docker". 366 | remove_keys _dummy_,kubernetes,docker 367 | 368 | 369 | 370 | @type record_reformer 371 | enable_ruby true 372 | 373 | # Extract local_resource_id from tag for 'k8s_container' monitored 374 | # resource. The format is: 375 | # 'k8s_container...'. 376 | "logging.googleapis.com/local_resource_id" ${"k8s_container.#{tag_suffix[4].rpartition('.')[0].split('_')[1]}.#{tag_suffix[4].rpartition('.')[0].split('_')[0]}.#{tag_suffix[4].rpartition('.')[0].split('_')[2].rpartition('-')[0]}"} 377 | # Rename the field 'log' to a more generic field 'message'. This way the 378 | # fluent-plugin-google-cloud knows to flatten the field as textPayload 379 | # instead of jsonPayload after extracting 'time', 'severity' and 380 | # 'stream' from the record. 381 | message ${record['log']} 382 | # If 'severity' is not set, assume stderr is ERROR and stdout is INFO. 383 | severity ${record['severity'] || if record['stream'] == 'stderr' then 'ERROR' else 'INFO' end} 384 | 385 | tag ${if record['stream'] == 'stderr' then 'raw.stderr' else 'raw.stdout' end} 386 | remove_keys stream,log 387 | 388 | 389 | # Detect exceptions in the log output and forward them as one log entry. 
390 | 391 | @type detect_exceptions 392 | 393 | remove_tag_prefix raw 394 | message message 395 | stream "logging.googleapis.com/local_resource_id" 396 | multiline_flush_interval 5 397 | max_bytes 500000 398 | max_lines 1000 399 | 400 | 2.pods.input.conf: |- 401 | # This configuration file for Fluentd is used 402 | # to watch changes to Kubernetes pod log files live in the 403 | # directory /var/log/pods/NAMESPACE_NAME_UID. The file name 404 | # is used to capture the pod namespace, name and uid. These 405 | # logs are then submitted to Google Cloud Logging with a 406 | # local_resource_id 'k8s_pod..' 407 | # which assumes the installation of the cloud-logging plug-in. 408 | 409 | @type tail 410 | path /var/log/pods/*/*.log 411 | pos_file /var/run/google-fluentd/pos-files/gcp-pods.pos 412 | # Tags at this point are in the format of: 413 | # pods.reform.var.log.pods.__..log 414 | tag pods.reform.* 415 | read_from_head true 416 | 417 | @type none 418 | 419 | 420 | 421 | @type record_reformer 422 | enable_ruby true 423 | 424 | # Extract local_resource_id from tag for 'k8s_pod' monitored 425 | # resource. The format is: 426 | # 'k8s_pod..'. 427 | "logging.googleapis.com/local_resource_id" ${"k8s_pod.#{tag_suffix[5].rpartition('.')[0].split('_')[0]}.#{tag_suffix[5].rpartition('.')[0].split('_')[1]}"} 428 | 429 | # Use the log file name as the tag. Currently only `gvisor` log is supported. 
430 | tag ${"#{tag_suffix[5].rpartition('.')[0].rpartition('.')[2]}"} 431 | 432 | 7.system.input.conf: |- 433 | # Example: 434 | # Dec 21 23:17:22 gke-foo-1-1-4b5cbd14-node-4eoj startupscript: Finished running startup script /var/run/google.startup.script 435 | 436 | @type tail 437 | format syslog 438 | path /var/log/startupscript.log 439 | pos_file /var/run/google-fluentd/pos-files/gcp-startupscript.pos 440 | tag startupscript 441 | 442 | 443 | # Example: 444 | # I1118 21:26:53.975789 6 proxier.go:1096] Port "nodePort for kube-system/default-http-backend:http" (:31429/tcp) was open before and is still needed 445 | 446 | @type tail 447 | format multiline 448 | multiline_flush_interval 5s 449 | format_firstline /^\w\d{4}/ 450 | format1 /^(?\w)(?