├── .github
    └── workflows
    │   └── auto-approve.yml
├── README.md
├── agents.yaml
├── compile-agents-yaml.sh
├── heapster.yaml
├── hooks
    └── pre-commit
├── logging-agent.yaml
├── metadata-agent.yaml
└── rbac-setup.yaml


/.github/workflows/auto-approve.yml:
--------------------------------------------------------------------------------
 1 | # Automatically approve PRs created by our release robot account.
 2 | name: Auto approve
 3 | on: pull_request
 4 | 
 5 | jobs:
 6 |   auto-approve:
 7 |     runs-on: ubuntu-latest
 8 |     # Submitting a review needs write access to pull requests. Request only
 9 |     # that scope so the job also works when the default token is read-only.
10 |     permissions:
11 |       pull-requests: write
12 |     steps:
13 |     # Runs only when the triggering actor is the release robot account;
14 |     # for any other actor the step is skipped and nothing is approved.
15 |     - uses: hmarr/auto-approve-action@v2.0.0
16 |       if: github.actor == 'stackdriver-instrumentation-release'
17 |       with:
18 |         github-token: "${{ secrets.GITHUB_TOKEN }}"
19 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Stackdriver Kubernetes Configs
 2 | 
 3 | A collection of Kubernetes configurations to integrate with Stackdriver
 4 | products. For now, it supports [Stackdriver Logging](https://cloud.google.com/logging/)
 5 | and [Stackdriver Monitoring](https://cloud.google.com/monitoring/). This repo is only used
 6 | for [manual installation](https://cloud.google.com/monitoring/kubernetes-engine/customizing) 
 7 | on existing clusters.
 8 | 
 9 | ## Setting up git hooks
10 | 
11 | From the root directory of this repo, please run the following command:
12 | 
13 | ```
14 | ln -s "$(realpath hooks/pre-commit)" "$(git rev-parse --git-dir)/hooks/pre-commit"
15 | ```
16 | 
17 | This will ensure that all commits run the
18 | [`compile-agents-yaml.sh`](#compile-agents-yaml) command.
19 | 
20 | ## Compile Agents YAML<a name="compile-agents-yaml"></a>
21 | 
22 | From the root directory of this repo, you can run the following command:
23 | 
24 | ```
25 | ./compile-agents-yaml.sh
26 | ```
27 | 
28 | This will re-generate the `agents.yaml` file to keep it
29 | up-to-date with other YAML file changes.
30 | 
31 | 


--------------------------------------------------------------------------------
/agents.yaml:
--------------------------------------------------------------------------------
  1 | # THIS FILE IS AUTO-GENERATED DO NOT EDIT
  2 | apiVersion: apps/v1
  3 | kind: Deployment  # Single-replica Heapster workload (see spec.replicas).
  4 | metadata:
  5 |   name: heapster
  6 |   namespace: stackdriver-agents
  7 |   labels:
  8 |     k8s-app: stackdriver-heapster
  9 |     version: v1.6.1
 10 | spec:
 11 |   replicas: 1
 12 |   strategy:
 13 |     type: RollingUpdate
 14 |     rollingUpdate:
 15 |       maxSurge: 1
 16 |       maxUnavailable: 1
 17 |   selector:
 18 |     matchLabels:
 19 |       k8s-app: stackdriver-heapster
 20 |   template:
 21 |     metadata:
 22 |       creationTimestamp: null
 23 |       labels:
 24 |         k8s-app: stackdriver-heapster
 25 |         version: v1.6.1
 26 |     spec:
 27 |       serviceAccount: heapster  # Same account as serviceAccountName below.
 28 |       serviceAccountName: heapster
 29 |       schedulerName: default-scheduler
 30 |       dnsPolicy: ClusterFirst
 31 |       restartPolicy: Always
 32 |       terminationGracePeriodSeconds: 30
 33 |       securityContext: {}
 34 |       volumes:
 35 |       - name: google-cloud-config
 36 |         configMap:
 37 |           name: google-cloud-config
 38 |           defaultMode: 420  # Decimal 420 == octal 0644.
 39 |       containers:
 40 |       - name: heapster
 41 |         image: gcr.io/stackdriver-agents/heapster-amd64:v1.6.1
 42 |         imagePullPolicy: Always
 43 |         command:
 44 |         - /heapster
 45 |         - --source=kubernetes.summary_api:https://kubernetes.default?kubeletHttps=true&kubeletPort=10250&insecure=true
 46 |         - --sink=stackdriver:?cluster_name=$(CLUSTER_NAME)&cluster_location=$(CLUSTER_LOCATION)&zone=$(CLUSTER_LOCATION)&use_old_resources=false&use_new_resources=true&min_interval_sec=100&batch_export_timeout_sec=110
 47 |         env:  # CLUSTER_NAME and CLUSTER_LOCATION are expanded into the --sink argument above.
 48 |         - name: CLUSTER_NAME
 49 |           valueFrom:
 50 |             configMapKeyRef:
 51 |               name: cluster-config
 52 |               key: cluster_name
 53 |         - name: CLUSTER_LOCATION
 54 |           valueFrom:
 55 |             configMapKeyRef:
 56 |               name: cluster-config
 57 |               key: cluster_location
 58 |         - name: GOOGLE_APPLICATION_CREDENTIALS
 59 |           valueFrom:
 60 |             configMapKeyRef:
 61 |               name: google-cloud-config
 62 |               key: credentials_path
 63 |         resources:
 64 |           requests:
 65 |             cpu: 88m
 66 |             memory: 204Mi
 67 |           limits:
 68 |             cpu: 88m
 69 |             memory: 204Mi
 70 |         livenessProbe:
 71 |           httpGet:
 72 |             scheme: HTTP
 73 |             path: /healthz
 74 |             port: 8082
 75 |           initialDelaySeconds: 180
 76 |           periodSeconds: 10
 77 |           timeoutSeconds: 5
 78 |           successThreshold: 1
 79 |           failureThreshold: 3
 80 |         volumeMounts:
 81 |         - name: google-cloud-config
 82 |           mountPath: /etc/google-cloud/
 83 |         terminationMessagePath: /dev/termination-log
 84 |         terminationMessagePolicy: File
 85 |       - name: heapster-nanny  # Runs /pod_nanny against the heapster deployment/container (see args).
 86 |         image: gcr.io/google_containers/addon-resizer:1.7
 87 |         imagePullPolicy: IfNotPresent
 88 |         command:
 89 |         - /pod_nanny
 90 |         - --cpu=80m
 91 |         - --extra-cpu=0.5m
 92 |         - --memory=140Mi
 93 |         - --extra-memory=4Mi
 94 |         - --threshold=5
 95 |         - --deployment=heapster
 96 |         - --container=heapster
 97 |         - --poll-period=300000
 98 |         - --estimator=exponential
 99 |         env:  # Pod identity is injected through the downward API (fieldRef).
100 |         - name: MY_POD_NAME
101 |           valueFrom:
102 |             fieldRef:
103 |               apiVersion: v1
104 |               fieldPath: metadata.name
105 |         - name: MY_POD_NAMESPACE
106 |           valueFrom:
107 |             fieldRef:
108 |               apiVersion: v1
109 |               fieldPath: metadata.namespace
110 |         resources:
111 |           requests:
112 |             cpu: 50m
113 |             memory: 112360Ki
114 |           limits:
115 |             cpu: 50m
116 |             memory: 112360Ki
117 |         terminationMessagePath: /dev/termination-log
118 |         terminationMessagePolicy: File
119 | 
120 | ---
121 | apiVersion: apps/v1
122 | kind: DaemonSet  # Logging agent pod; the tolerations below let it run on tainted nodes too.
123 | metadata:
124 |   labels:
125 |     app: stackdriver-logging-agent
126 |   name: stackdriver-logging-agent
127 |   namespace: stackdriver-agents
128 | spec:
129 |   selector:
130 |     matchLabels:
131 |       app: stackdriver-logging-agent
132 |   template:
133 |     metadata:
134 |       labels:
135 |         app: stackdriver-logging-agent
136 |     spec:
137 |       containers:
138 |       - env:
139 |         - name: NODE_NAME
140 |           valueFrom:
141 |             fieldRef:
142 |               apiVersion: v1
143 |               fieldPath: spec.nodeName
144 |         - name: K8S_NODE_NAME
145 |           valueFrom:
146 |             fieldRef:
147 |               apiVersion: v1
148 |               fieldPath: spec.nodeName
149 |         - name: GOOGLE_APPLICATION_CREDENTIALS
150 |           valueFrom:
151 |             configMapKeyRef:
152 |               name: google-cloud-config
153 |               key: credentials_path
154 |         - name: CLUSTER_NAME
155 |           valueFrom:
156 |             configMapKeyRef:
157 |               name: cluster-config
158 |               key: cluster_name
159 |         - name: CLUSTER_LOCATION
160 |           valueFrom:
161 |             configMapKeyRef:
162 |               name: cluster-config
163 |               key: cluster_location
164 |         image: gcr.io/stackdriver-agents/stackdriver-logging-agent:1.10.3
165 |         imagePullPolicy: IfNotPresent
166 |         livenessProbe:  # Fails if the buffer dir is missing or no buffer file changed within the thresholds.
167 |           exec:  # Past STUCK_THRESHOLD_SECONDS (default 900) the buffers are also deleted; LIVENESS_THRESHOLD_SECONDS defaults to 300.
168 |             command:
169 |             - /bin/sh  # Keep the script POSIX: plain [ ] tests, not bash-only [[ ]].
170 |             - -c
171 |             - |
172 |               LIVENESS_THRESHOLD_SECONDS=${LIVENESS_THRESHOLD_SECONDS:-300}; STUCK_THRESHOLD_SECONDS=${STUCK_THRESHOLD_SECONDS:-900}; if [ ! -e /var/run/google-fluentd/buffers ]; then
173 |                 exit 1;
174 |               fi; touch -d "${STUCK_THRESHOLD_SECONDS} seconds ago" /tmp/marker-stuck; if [ -z "$(find /var/run/google-fluentd/buffers -type f -newer /tmp/marker-stuck -print -quit)" ]; then
175 |                 rm -rf /var/run/google-fluentd/buffers;
176 |                 exit 1;
177 |               fi; touch -d "${LIVENESS_THRESHOLD_SECONDS} seconds ago" /tmp/marker-liveness; if [ -z "$(find /var/run/google-fluentd/buffers -type f -newer /tmp/marker-liveness -print -quit)" ]; then
178 |                 exit 1;
179 |               fi;
180 |           failureThreshold: 3
181 |           initialDelaySeconds: 600
182 |           periodSeconds: 60
183 |           successThreshold: 1
184 |           timeoutSeconds: 1
185 |         name: logging-agent
186 |         resources:
187 |           limits:
188 |             cpu: "1"
189 |             memory: 300Mi
190 |           requests:
191 |             cpu: 100m
192 |             memory: 200Mi
193 |         terminationMessagePath: /dev/termination-log
194 |         terminationMessagePolicy: File
195 |         volumeMounts:  # Host log/run directories plus the Fluentd and credentials ConfigMaps declared under volumes.
196 |         - mountPath: /var/run
197 |           name: varrun
198 |         - mountPath: /var/log
199 |           name: varlog
200 |         - mountPath: /var/lib/docker/containers
201 |           name: varlibdockercontainers
202 |           readOnly: true
203 |         - mountPath: /etc/google-fluentd/google-fluentd.conf
204 |           subPath: google-fluentd.conf
205 |           name: output-config-volume
206 |         - mountPath: /etc/google-fluentd/config.d
207 |           name: input-config-volume
208 |         - mountPath: /etc/google-cloud/
209 |           name: google-cloud-config
210 |       serviceAccount: logging-agent
211 |       serviceAccountName: logging-agent
212 |       dnsPolicy: ClusterFirst
213 |       restartPolicy: Always
214 |       schedulerName: default-scheduler
215 |       securityContext: {}
216 |       tolerations:  # Tolerate every NoExecute and NoSchedule taint (operator Exists, no key).
217 |       - operator: "Exists"
218 |         effect: "NoExecute"
219 |       - operator: "Exists"
220 |         effect: "NoSchedule"
221 |       volumes:
222 |       - hostPath:
223 |           path: /var/run
224 |           type: ""
225 |         name: varrun
226 |       - hostPath:
227 |           path: /var/log
228 |           type: ""
229 |         name: varlog
230 |       - hostPath:
231 |           path: /var/lib/docker/containers
232 |           type: ""
233 |         name: varlibdockercontainers
234 |       - configMap:
235 |           defaultMode: 420
236 |           name: logging-agent-output-config
237 |         name: output-config-volume
238 |       - configMap:
239 |           defaultMode: 420
240 |           name: logging-agent-input-config
241 |         name: input-config-volume
242 |       - configMap:
243 |           defaultMode: 420
244 |           name: google-cloud-config
245 |         name: google-cloud-config
246 |   updateStrategy:
247 |     rollingUpdate:
248 |       maxUnavailable: 1
249 |     type: RollingUpdate
250 | ---
251 | # Config map for Logging Agent input and corresponding filter plugins.
252 | apiVersion: v1
253 | kind: ConfigMap
254 | metadata:
255 |   name: logging-agent-input-config
256 |   namespace: stackdriver-agents
257 | data:
258 |   1.containers.input.conf: |-
259 |     # This configuration file for Fluentd is used
260 |     # to watch changes to Docker log files that live in the
261 |     # directory /var/lib/docker/containers/ and are symbolically
262 |     # linked to from the /var/log/containers directory using names that capture the
263 |     # pod name and container name. These logs are then submitted to
264 |     # Google Cloud Logging which assumes the installation of the cloud-logging plug-in.
265 |     #
266 |     # Example
267 |     # =======
268 |     # A line in the Docker log file might look like this JSON:
269 |     #
270 |     # {"log":"2014/09/25 21:15:03 Got request with path wombat\\n",
271 |     #  "stream":"stderr",
272 |     #   "time":"2014-09-25T21:15:03.499185026Z"}
273 |     #
274 |     # The original tag is derived from the log file's location.
275 |     # For example a Docker container's logs might be in the directory:
276 |     #  /var/lib/docker/containers/997599971ee6366d4a5920d25b79286ad45ff37a74494f262e3bc98d909d0a7b
277 |     # and in the file:
278 |     #  997599971ee6366d4a5920d25b79286ad45ff37a74494f262e3bc98d909d0a7b-json.log
279 |     # where 997599971ee6... is the Docker ID of the running container.
280 |     # The Kubernetes kubelet makes a symbolic link to this file on the host
281 |     # machine in the /var/log/containers directory which includes the pod name,
282 |     # the namespace name and the Kubernetes container name:
283 |     #    synthetic-logger-0.25lps-pod_default_synth-lgr-997599971ee6366d4a5920d25b79286ad45ff37a74494f262e3bc98d909d0a7b.log
284 |     #    ->
285 |     #    /var/lib/docker/containers/997599971ee6366d4a5920d25b79286ad45ff37a74494f262e3bc98d909d0a7b/997599971ee6366d4a5920d25b79286ad45ff37a74494f262e3bc98d909d0a7b-json.log
286 |     # The /var/log directory on the host is mapped to the /var/log directory in the container
287 |     # running this instance of Fluentd and we end up collecting the file:
288 |     #   /var/log/containers/synthetic-logger-0.25lps-pod_default_synth-lgr-997599971ee6366d4a5920d25b79286ad45ff37a74494f262e3bc98d909d0a7b.log
289 |     # This results in the tag:
290 |     #  var.log.containers.synthetic-logger-0.25lps-pod_default_synth-lgr-997599971ee6366d4a5920d25b79286ad45ff37a74494f262e3bc98d909d0a7b.log
291 |     # where 'synthetic-logger-0.25lps-pod' is the pod name, 'default' is the
292 |     # namespace name, 'synth-lgr' is the container name and '997599971ee6..' is
293 |     # the container ID.
294 |     # The record reformer is used to extract pod_name, namespace_name and
295 |     # container_name from the tag and set them in a local_resource_id in the
296 |     # format of:
297 |     # 'k8s_container.<NAMESPACE_NAME>.<POD_NAME>.<CONTAINER_NAME>'.
298 |     # The reformer also changes the tags to 'stderr' or 'stdout' based on the
299 |     # value of 'stream'.
300 |     # local_resource_id is later used by google_cloud plugin to determine the
301 |     # monitored resource to ingest logs against.
302 | 
303 |     # Json Log Example:
304 |     # {"log":"[info:2016-02-16T16:04:05.930-08:00] Some log text here\n","stream":"stdout","time":"2016-02-17T00:04:05.931087621Z"}
305 |     # CRI Log Example:
306 |     # 2016-02-17T00:04:05.931087621Z stdout F [info:2016-02-16T16:04:05.930-08:00] Some log text here
307 |     <source>
308 |       @type tail
309 |       path /var/log/containers/*.log
310 |       pos_file /var/run/google-fluentd/pos-files/gcp-containers.pos
311 |       # Tags at this point are in the format of:
312 |       # reform.var.log.containers.<POD_NAME>_<NAMESPACE_NAME>_<CONTAINER_NAME>-<CONTAINER_ID>.log
313 |       tag reform.*
314 |       read_from_head true
315 |       <parse>
316 |         @type multi_format
317 |         <pattern>
318 |           format json
319 |           time_key time
320 |           time_format %Y-%m-%dT%H:%M:%S.%NZ
321 |         </pattern>
322 |         <pattern>
323 |           format /^(?<time>.+) (?<stream>stdout|stderr) [^ ]* (?<log>.*)$/
324 |           time_format %Y-%m-%dT%H:%M:%S.%N%:z
325 |         </pattern>
326 |       </parse>
327 |     </source>
328 | 
329 |     <filter reform.**>
330 |       @type parser
331 |       format /^(?<severity>\w)(?<time>\d{4} [^\s]*)\s+(?<pid>\d+)\s+(?<source>[^ \]]+)\] (?<log>.*)/
332 |       reserve_data true
333 |       suppress_parse_error_log true
334 |       emit_invalid_record_to_error false
335 |       key_name log
336 |     </filter>
337 | 
338 |     <filter reform.**>
339 |       # This plugin uses environment variables KUBERNETES_SERVICE_HOST and
340 |       # KUBERNETES_SERVICE_PORT to talk to the API server. These environment
341 |       # variables are added by kubelet automatically.
342 |       @type kubernetes_metadata
343 |       # Interval in seconds to dump cache stats locally in the Fluentd log.
344 |       stats_interval 300
345 |       # TTL in seconds of each cached element.
346 |       cache_ttl 30
347 |       # Skip fetching unused metadata.
348 |       skip_container_metadata true
349 |       skip_master_url true
350 |       skip_namespace_metadata true
351 |     </filter>
352 | 
353 |     <filter reform.**>
354 |       # We have to use record_modifier because only this plugin supports complex
355 |       # logic to modify record the way we need.
356 |       @type record_modifier
357 |       enable_ruby true
358 |       <record>
359 |         # Extract "kubernetes"->"labels" and set them as
360 |         # "logging.googleapis.com/labels". Prefix these labels with
361 |         # "k8s-pod" to distinguish with other labels and avoid
362 |         # label name collision with other types of labels.
363 |         _dummy_ ${if record.is_a?(Hash) && record.has_key?('kubernetes') && record['kubernetes'].has_key?('labels') && record['kubernetes']['labels'].is_a?(Hash); then; record["logging.googleapis.com/labels"] = record['kubernetes']['labels'].map{ |k, v| ["k8s-pod/#{k}", v]}.to_h; end; nil}
364 |       </record>
365 |       # Delete this dummy field and the rest of "kubernetes" and "docker".
366 |       remove_keys _dummy_,kubernetes,docker
367 |     </filter>
368 | 
369 |     <match reform.**>
370 |       @type record_reformer
371 |       enable_ruby true
372 |       <record>
373 |         # Extract local_resource_id from tag for 'k8s_container' monitored
374 |         # resource. The format is:
375 |         # 'k8s_container.<namespace_name>.<pod_name>.<container_name>'.
376 |         "logging.googleapis.com/local_resource_id" ${"k8s_container.#{tag_suffix[4].rpartition('.')[0].split('_')[1]}.#{tag_suffix[4].rpartition('.')[0].split('_')[0]}.#{tag_suffix[4].rpartition('.')[0].split('_')[2].rpartition('-')[0]}"}
377 |         # Rename the field 'log' to a more generic field 'message'. This way the
378 |         # fluent-plugin-google-cloud knows to flatten the field as textPayload
379 |         # instead of jsonPayload after extracting 'time', 'severity' and
380 |         # 'stream' from the record.
381 |         message ${record['log']}
382 |         # If 'severity' is not set, assume stderr is ERROR and stdout is INFO.
383 |         severity ${record['severity'] || if record['stream'] == 'stderr' then 'ERROR' else 'INFO' end}
384 |       </record>
385 |       tag ${if record['stream'] == 'stderr' then 'raw.stderr' else 'raw.stdout' end}
386 |       remove_keys stream,log
387 |     </match>
388 | 
389 |     # Detect exceptions in the log output and forward them as one log entry.
390 |     <match {raw.stderr,raw.stdout}>
391 |       @type detect_exceptions
392 | 
393 |       remove_tag_prefix raw
394 |       message message
395 |       stream "logging.googleapis.com/local_resource_id"
396 |       multiline_flush_interval 5
397 |       max_bytes 500000
398 |       max_lines 1000
399 |     </match>
400 |   2.pods.input.conf: |-
401 |     # This configuration file for Fluentd is used
402 |     # to watch changes to Kubernetes pod log files that live in the
403 |     # directory /var/log/pods/NAMESPACE_NAME_UID. The file name
404 |     # is used to capture the pod namespace, name and uid. These
405 |     # logs are then submitted to Google Cloud Logging with a
406 |     # local_resource_id 'k8s_pod.<NAMESPACE_NAME>.<POD_NAME>'
407 |     # which assumes the installation of the cloud-logging plug-in.
408 |     <source>
409 |       @type tail
410 |       path /var/log/pods/*/*.log
411 |       pos_file /var/run/google-fluentd/pos-files/gcp-pods.pos
412 |       # Tags at this point are in the format of:
413 |       # pods.reform.var.log.pods.<POD_NAMESPACE>_<POD_NAME>_<POD_UID>.<FILE_NAME>.log
414 |       tag pods.reform.*
415 |       read_from_head true
416 |       <parse>
417 |         @type none
418 |       </parse>
419 |     </source>
420 |     <match pods.reform.**>
421 |       @type record_reformer
422 |       enable_ruby true
423 |       <record>
424 |         # Extract local_resource_id from tag for 'k8s_pod' monitored
425 |         # resource. The format is:
426 |         # 'k8s_pod.<namespace_name>.<pod_name>'.
427 |         "logging.googleapis.com/local_resource_id" ${"k8s_pod.#{tag_suffix[5].rpartition('.')[0].split('_')[0]}.#{tag_suffix[5].rpartition('.')[0].split('_')[1]}"}
428 |       </record>
429 |       # Use the log file name as the tag. Currently only `gvisor` log is supported.
430 |       tag ${"#{tag_suffix[5].rpartition('.')[0].rpartition('.')[2]}"}
431 |     </match>
432 |   7.system.input.conf: |-
433 |     # Example:
434 |     # Dec 21 23:17:22 gke-foo-1-1-4b5cbd14-node-4eoj startupscript: Finished running startup script /var/run/google.startup.script
435 |     <source>
436 |       @type tail
437 |       format syslog
438 |       path /var/log/startupscript.log
439 |       pos_file /var/run/google-fluentd/pos-files/gcp-startupscript.pos
440 |       tag startupscript
441 |     </source>
442 | 
443 |     # Example:
444 |     # I1118 21:26:53.975789       6 proxier.go:1096] Port "nodePort for kube-system/default-http-backend:http" (:31429/tcp) was open before and is still needed
445 |     <source>
446 |       @type tail
447 |       format multiline
448 |       multiline_flush_interval 5s
449 |       format_firstline /^\w\d{4}/
450 |       format1 /^(?<severity>\w)(?<time>\d{4} [^\s]*)\s+(?<pid>\d+)\s+(?<source>[^ \]]+)\] (?<message>.*)/
451 |       time_format %m%d %H:%M:%S.%N
452 |       path /var/log/kube-proxy.log
453 |       pos_file /var/run/google-fluentd/pos-files/gcp-kube-proxy.pos
454 |       tag kube-proxy
455 |     </source>
456 | 
457 |     # Logs from systemd-journal for interesting services.
458 |     # TODO(random-liu): Keep this for compatibility, remove this after
459 |     # cri container runtime rolls out.
460 |     <source>
461 |       @type systemd
462 |       filters [{ "_SYSTEMD_UNIT": "docker.service" }]
463 |       <storage>
464 |         @type local
465 |         path /var/run/google-fluentd/pos-files/gcp-journald-docker.pos
466 |       </storage>
467 |       read_from_head true
468 |       tag docker
469 |     </source>
470 | 
471 |     <source>
472 |       @type systemd
473 |       filters [{ "_SYSTEMD_UNIT": "containerd.service" }]
474 |       <storage>
475 |         @type local
476 |         path /var/run/google-fluentd/pos-files/gcp-journald-container-runtime.pos
477 |       </storage>
478 |       read_from_head true
479 |       tag container-runtime
480 |     </source>
481 | 
482 |     <source>
483 |       @type systemd
484 |       filters [{ "_SYSTEMD_UNIT": "kubelet.service" }]
485 |       <storage>
486 |         @type local
487 |         path /var/run/google-fluentd/pos-files/gcp-journald-kubelet.pos
488 |       </storage>
489 |       read_from_head true
490 |       tag kubelet
491 |     </source>
492 | 
493 |     # kube-node-installation, kube-node-configuration, and kube-logrotate are
494 |     # oneshots, but it's extremely valuable to have their logs on Stackdriver
495 |     # as they can diagnose critical issues with node startup.
496 |     # See http://cs/cloud-gke-kubernetes/cluster/gce/gci/node.yaml.
497 |     <source>
498 |       @type systemd
499 |       filters [{ "_SYSTEMD_UNIT": "kube-node-installation.service" }]
500 |       <storage>
501 |         @type local
502 |         path /var/run/google-fluentd/pos-files/gcp-journald-kube-node-installation.pos
503 |       </storage>
504 |       read_from_head true
505 |       tag kube-node-installation
506 |     </source>
507 | 
508 |     <source>
509 |       @type systemd
510 |       filters [{ "_SYSTEMD_UNIT": "kube-node-configuration.service" }]
511 |       <storage>
512 |         @type local
513 |         path /var/run/google-fluentd/pos-files/gcp-journald-kube-node-configuration.pos
514 |       </storage>
515 |       read_from_head true
516 |       tag kube-node-configuration
517 |     </source>
518 | 
519 |     <source>
520 |       @type systemd
521 |       filters [{ "_SYSTEMD_UNIT": "kube-logrotate.service" }]
522 |       <storage>
523 |         @type local
524 |         path /var/run/google-fluentd/pos-files/gcp-journald-kube-logrotate.pos
525 |       </storage>
526 |       read_from_head true
527 |       tag kube-logrotate
528 |     </source>
529 | 
530 |     <source>
531 |       @type systemd
532 |       filters [{ "_SYSTEMD_UNIT": "node-problem-detector.service" }]
533 |       <storage>
534 |         @type local
535 |         path /var/run/google-fluentd/pos-files/gcp-journald-node-problem-detector.pos
536 |       </storage>
537 |       read_from_head true
538 |       tag node-problem-detector
539 |     </source>
540 | 
541 |     <source>
542 |       @type systemd
543 |       filters [{ "_SYSTEMD_UNIT": "kube-container-runtime-monitor.service" }]
544 |       <storage>
545 |         @type local
546 |         path /var/run/google-fluentd/pos-files/gcp-journald-kube-container-runtime-monitor.pos
547 |       </storage>
548 |       read_from_head true
549 |       tag kube-container-runtime-monitor
550 |     </source>
551 | 
552 |     <source>
553 |       @type systemd
554 |       filters [{ "_SYSTEMD_UNIT": "kubelet-monitor.service" }]
555 |       <storage>
556 |         @type local
557 |         path /var/run/google-fluentd/pos-files/gcp-journald-kubelet-monitor.pos
558 |       </storage>
559 |       read_from_head true
560 |       tag kubelet-monitor
561 |     </source>
562 | 
563 |     # Whether to include node-journal or not is determined when starting the
564 |     # cluster. It is not changed when the cluster is already running.
565 |     <source>
566 |       @type systemd
567 |       <storage>
568 |         @type local
569 |         path /var/run/google-fluentd/pos-files/gcp-journald.pos
570 |       </storage>
571 |       read_from_head true
572 |       tag node-journal
573 |     </source>
574 | 
575 |     <filter node-journal>
576 |       @type grep
577 |       <exclude>
578 |         key _SYSTEMD_UNIT
579 |         pattern ^(docker|containerd|kubelet|kube-node-installation|kube-node-configuration|kube-logrotate|node-problem-detector|kube-container-runtime-monitor|kubelet-monitor)\.service$
580 |       </exclude>
581 |     </filter>
582 |   5.monitoring.conf: |-
583 |     # This source is used to acquire an approximate process start timestamp,
584 |     # whose purpose is explained before the corresponding output plugin.
585 |     <source>
586 |       @type exec
587 |       command /bin/sh -c 'date +%s'
588 |       tag process_start
589 |       time_format %Y-%m-%d %H:%M:%S
590 |       keys process_start_timestamp
591 |     </source>
592 | 
593 |     # This filter is used to convert process start timestamp to integer
594 |     # value for correct ingestion in the prometheus output plugin.
595 |     <filter process_start>
596 |       @type record_transformer
597 |       enable_ruby true
598 |       auto_typecast true
599 |       <record>
600 |         process_start_timestamp ${record["process_start_timestamp"].to_i}
601 |       </record>
602 |     </filter>
603 |   6.output.conf: |-
604 |     # This match is placed before the all-matching output to provide metric
605 |     # exporter with a process start timestamp for correct exporting of
606 |     # cumulative metrics to Stackdriver.
607 |     <match process_start>
608 |       @type prometheus
609 | 
610 |       <metric>
611 |         type gauge
612 |         name process_start_time_seconds
613 |         desc Timestamp of the process start in seconds
614 |         key process_start_timestamp
615 |       </metric>
616 |     </match>
617 | 
618 |     # This filter allows to count the number of log entries read by fluentd
619 |     # before they are processed by the output plugin. This in turn allows to
620 |     # monitor the number of log entries that were read but never sent, e.g.
621 |     # because of liveness probe removing buffer.
622 |     <filter **>
623 |       @type prometheus
624 |       <metric>
625 |         type counter
626 |         name logging_entry_count
627 |         desc Total number of log entries generated by either application containers or system components
628 |       </metric>
629 |     </filter>
630 | 
631 |     # This section is exclusive for k8s_container logs. Those come with
632 |     # 'stderr'/'stdout' tags.
633 |     # TODO(instrumentation): Reconsider this workaround later.
634 |     # Trim the entries which exceed slightly less than 100KB, to avoid
635 |     # dropping them. It is a necessity, because Stackdriver only supports
636 |     # entries that are up to 100KB in size.
637 |     <filter {stderr,stdout}>
638 |       @type record_transformer
639 |       enable_ruby true
640 |       <record>
641 |         message ${record['message'].length > 100000 ? "[Trimmed]#{record['message'][0..100000]}..." : record['message']}
642 |       </record>
643 |     </filter>
644 | 
645 |     # Do not collect fluentd's own logs to avoid infinite loops.
646 |     <match fluent.**>
647 |       @type null
648 |     </match>
649 | 
650 |     # Add a unique insertId to each log entry that doesn't already have it.
651 |     # This helps guarantee the order and prevent log duplication.
652 |     <filter **>
653 |       @type add_insert_ids
654 |     </filter>
655 | 
656 |     # This filter parses the 'source' field created for glog lines into a single
657 |     # top-level field, for proper processing by the output plugin.
658 |     # For example, if a record includes:
659 |     #     {"source":"handlers.go:131"},
660 |     # then the following entry will be added to the record:
661 |     #     {"logging.googleapis.com/sourceLocation":
662 |     #          {"file":"handlers.go", "line":"131"}
663 |     #     }
664 |     <filter **>
665 |       @type record_transformer
666 |       enable_ruby true
667 |       <record>
668 |         "logging.googleapis.com/sourceLocation" ${if record.is_a?(Hash) && record.has_key?('source'); source_parts = record['source'].split(':', 2); {'file' => source_parts[0], 'line' => source_parts[1]} if source_parts.length == 2; else; nil; end}
669 |       </record>
670 |     </filter>
671 | 
672 | 
673 |     # This section is exclusive for k8s_container logs. These logs come with
674 |     # 'stderr'/'stdout' tags.
675 |     # We use a separate output stanza for 'k8s_node' logs with a smaller buffer
676 |     # because node logs are less important than user's container logs.
677 |     <match {stderr,stdout}>
678 |       @type google_cloud
679 | 
680 |       # Try to detect JSON formatted log entries.
681 |       detect_json true
682 |       # Collect metrics in Prometheus registry about plugin activity.
683 |       enable_monitoring true
684 |       monitoring_type prometheus
685 |       # Allow log entries from multiple containers to be sent in the same request.
686 |       split_logs_by_tag false
687 |       # Set the buffer type to file to improve the reliability and reduce the memory consumption
688 |       buffer_type file
689 |       buffer_path /var/run/google-fluentd/buffers/kubernetes.containers.buffer
690 |       # Set queue_full action to block because we want to pause gracefully
691 |       # in case of the off-the-limits load instead of throwing an exception
692 |       buffer_queue_full_action block
693 |       # Set the chunk limit conservatively to avoid exceeding the recommended
694 |       # chunk size of 5MB per write request.
695 |       buffer_chunk_limit 512k
696 |       # Cap the combined memory usage of this buffer and the one below to
697 |       # 512KiB/chunk * (6 + 2) chunks = 4 MiB
698 |       buffer_queue_limit 6
699 |       # Never wait more than 5 seconds before flushing logs in the non-error case.
700 |       flush_interval 5s
701 |       # Never wait longer than 30 seconds between retries.
702 |       max_retry_wait 30
703 |       # Disable the limit on the number of retries (retry forever).
704 |       disable_retry_limit
705 |       # Use multiple threads for processing.
706 |       num_threads 2
707 |       use_grpc true
708 |       k8s_cluster_name "#{ENV["CLUSTER_NAME"]}"
709 |       k8s_cluster_location "#{ENV["CLUSTER_LOCATION"]}"
710 |       # Skip timestamp adjustment as this is in a controlled environment with
711 |       # known timestamp format. This helps with CPU usage.
712 |       adjust_invalid_timestamps false
713 |     </match>
714 | 
715 |     # This section is exclusive for 'gvisor' logs. These logs come with the
716 |     # `gvisor` tag.
717 |     # We use a separate output stanza for 'gvisor' logs with a smaller
718 |     # buffer because user's container application logs are more important.
719 |     <match gvisor>
720 |       @type google_cloud
721 |       detect_json true
722 |       enable_monitoring true
723 |       monitoring_type prometheus
724 |       # Allow log entries from multiple pods to be sent in the same request.
725 |       split_logs_by_tag false
726 |       buffer_type file
727 |       buffer_path /var/run/google-fluentd/buffers/kubernetes.pod.buffer
728 |       buffer_queue_full_action block
729 |       buffer_chunk_limit 512k
730 |       buffer_queue_limit 2
731 |       flush_interval 5s
732 |       max_retry_wait 30
733 |       disable_retry_limit
734 |       num_threads 2
735 |       use_grpc true
736 |       # Skip timestamp adjustment as this is in a controlled environment with
737 |       # known timestamp format. This helps with CPU usage.
738 |       adjust_invalid_timestamps false
739 |     </match>
740 | 
741 |     # Attach local_resource_id for 'k8s_node' monitored resource.
742 |     <filter **>
743 |       @type record_transformer
744 |       enable_ruby true
745 |       <record>
746 |         "logging.googleapis.com/local_resource_id" ${"k8s_node.#{ENV['NODE_NAME']}"}
747 |       </record>
748 |     </filter>
749 | 
750 |     # This section is exclusive for 'k8s_node' logs. These logs come with tags
751 |     # that are neither 'stderr' nor 'stdout'.
752 |     # We use a separate output stanza for 'k8s_container' logs with a larger
753 |     # buffer because user's container logs are more important than node logs.
754 |     <match **>
755 |       @type google_cloud
756 | 
757 |       detect_json true
758 |       enable_monitoring true
759 |       monitoring_type prometheus
760 |       # Allow entries from multiple system logs to be sent in the same request.
761 |       split_logs_by_tag false
762 |       detect_subservice false
763 |       buffer_type file
764 |       buffer_path /var/run/google-fluentd/buffers/kubernetes.system.buffer
765 |       buffer_queue_full_action block
766 |       buffer_chunk_limit 512k
767 |       buffer_queue_limit 2
768 |       flush_interval 5s
769 |       max_retry_wait 30
770 |       disable_retry_limit
771 |       num_threads 2
772 |       use_grpc true
773 |       k8s_cluster_name "#{ENV["CLUSTER_NAME"]}"
774 |       k8s_cluster_location "#{ENV["CLUSTER_LOCATION"]}"
775 |       # Skip timestamp adjustment as this is in a controlled environment with
776 |       # known timestamp format. This helps with CPU usage.
777 |       adjust_invalid_timestamps false
778 |     </match>
779 | 
780 |   4.knative.input.conf: |-
781 |     # This configuration file for Fluentd is used to collect the logs located
782 |     # inside the /var/log directory of a Cloud Run on GKE / Knative container.
783 |     # Knative mounts an emptyDir volume named 'knative-var-log' inside the
784 |     # user container and if collection is enabled it creates a symbolic link
785 |     # inside another emptyDir named 'knative-internal' that contains the
786 |     # information needed for Kubernetes metadata enrichment.
787 |     #
788 |     # Concretely, on the host the symbolic link is:
789 |     # /var/lib/kubelet/pods/<POD_ID>/volumes/kubernetes.io~empty-dir/knative-internal/<NAMESPACE_NAME>_<POD_NAME>_<CONTAINER_NAME>
790 |     # ->
791 |     # /var/lib/kubelet/pods/<POD_ID>/volumes/kubernetes.io~empty-dir/knative-var-log
792 |     #
793 |     # The record reformer is used to extract pod_name, namespace_name and
794 |     # container_name from the tag and set them in a local_resource_id in the
795 |     # format of:
796 |     # 'k8s_container.<NAMESPACE_NAME>.<POD_NAME>.<CONTAINER_NAME>'.
797 |     # The reformer also sets the label 'source' to the path of the log file as seen
798 |     # from the user container point of view to be able to identify the source of
799 |     # a log entry.
800 |     <source>
801 |       @type tail
802 |       # **/*/**/* allows path expansion to go through the symbolic link and then recursively through /var/log
803 |       path /var/lib/kubelet/pods/*/volumes/kubernetes.io~empty-dir/knative-internal/**/*/**/*
804 |       # Save the path of the file as field 'source'
805 |       path_key source
806 |       pos_file /var/run/google-fluentd/pos-files/knative-var-log.pos
807 |       # Tags at this point are in the format of:
808 |       # knative.reform.var.lib.kubelet.pods.<POD_ID>.volumes.kubernetes.io~empty-dir.knative-internal.<NAMESPACE_NAME>_<POD_NAME>_<CONTAINER_NAME>.<PATH_INSIDE_VAR_LOG>
809 |       tag knative.reform.*
810 |       read_from_head true
811 |       <parse>
812 |         @type multi_format
813 |         <pattern>
814 |           format json
815 |           time_key time
816 |           time_format %Y-%m-%dT%H:%M:%S.%NZ
817 |         </pattern>
818 |         <pattern>
819 |           format /^(?<time>.+) (?<stream>stdout|stderr) [^ ]* (?<log>.*)$/
820 |           time_format %Y-%m-%dT%H:%M:%S.%N%:z
821 |         </pattern>
822 |         <pattern>
823 |           format none
824 |           message_key log
825 |         </pattern>
826 |       </parse>
827 |     </source>
828 | 
829 |     <filter knative.reform.**>
830 |       @type parser
831 |       format /^(?<severity>\w)(?<time>\d{4} [^\s]*)\s+(?<pid>\d+)\s+(?<source>[^ \]]+)\] (?<log>.*)/
832 |       reserve_data true
833 |       suppress_parse_error_log true
834 |       emit_invalid_record_to_error false
835 |       key_name log
836 |     </filter>
837 | 
838 |     <filter knative.reform.**>
839 |       # This plugin uses environment variables KUBERNETES_SERVICE_HOST and
840 |       # KUBERNETES_SERVICE_PORT to talk to the API server. These environment
841 |       # variables are added by kubelet automatically.
842 |       @type kubernetes_metadata
843 |       # Interval in seconds to dump cache stats locally in the Fluentd log.
844 |       stats_interval 300
845 |       # TTL in seconds of each cached element.
846 |       cache_ttl 30
847 |       # Custom regex to extract the fields
848 |       tag_to_kubernetes_name_regexp (?<docker_id>[a-z0-9]{8}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{12})\.volumes.kubernetes\.io~empty-dir\.knative-internal\.(?<namespace>[^_]+)_(?<pod_name>[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*)_(?<container_name>user-container)\..*?$
849 |     </filter>
850 | 
851 |     <filter knative.reform.**>
852 |       # We have to use record_modifier because only this plugin supports complex
853 |       # logic to modify record the way we need.
854 |       @type record_modifier
855 |       enable_ruby true
856 |       <record>
857 |         # Extract "kubernetes"->"labels" and set them as
858 |         # "logging.googleapis.com/labels". Prefix these labels with
859 |         # "k8s-pod" to distinguish with other labels and avoid
860 |         # label name collision with other types of labels.
861 |         _dummy_ ${if record.is_a?(Hash) && record.has_key?('kubernetes') && record['kubernetes'].has_key?('labels') && record['kubernetes']['labels'].is_a?(Hash); then; record['logging.googleapis.com/labels'] = record['kubernetes']['labels'].map{ |k, v| ["k8s-pod/#{k}", v]}.to_h; end; nil}
862 |       </record>
863 |       # Delete this dummy field and the rest of "kubernetes" and "docker".
864 |       remove_keys _dummy_,kubernetes,docker
865 |     </filter>
866 | 
867 |     <match knative.reform.**>
868 |       @type record_reformer
869 |       enable_ruby true
870 |       <record>
871 |         # Extract local_resource_id from tag for 'k8s_container' monitored
872 |         # resource. The format is:
873 |         # 'k8s_container.<namespace_name>.<pod_name>.<container_name>'.
874 |         "logging.googleapis.com/local_resource_id" ${"k8s_container.#{tag_parts[11].gsub('_', '.')}"}
875 |         # Rename the field 'log' to a more generic field 'message'. This way the
876 |         # fluent-plugin-google-cloud knows to flatten the field as textPayload
877 |         # instead of jsonPayload after extracting 'time', 'severity' and
878 |         # 'stream' from the record.
879 |         message ${record['log']}
880 |         # If 'severity' is not set, assume it is INFO.
881 |         severity ${record['severity'] || 'INFO'}
882 |         # Set 'source' label to the path of the log file as seen from the Knative container point of view
883 |         _dummy_ ${record["logging.googleapis.com/labels"]["source"] = "/var/log/" + record["source"].scan(/\/knative-internal\/[^\/]+\/(.*)/).last.last}
884 |       </record>
885 |       tag knative.stdout
886 |       remove_keys _dummy_,log,source
887 |     </match>
888 | 
889 |     # Detect exceptions in the log output and forward them as one log entry.
890 |     <match knative.stdout>
891 |       @type detect_exceptions
892 |       remove_tag_prefix knative
893 |       message message
894 |       stream "logging.googleapis.com/local_resource_id"
895 |       multiline_flush_interval 5
896 |       max_bytes 500000
897 |       max_lines 1000
898 |     </match>
899 | ---
900 | # Config map for Logging Agent output and corresponding filter plugins.
901 | apiVersion: v1
902 | kind: ConfigMap
903 | metadata:
904 |   name: logging-agent-output-config
905 |   namespace: stackdriver-agents
906 | data:
907 |   google-fluentd.conf: |-
908 |     @include config.d/*.conf
909 | 
910 | ---
911 | apiVersion: apps/v1
912 | kind: Deployment
913 | metadata:
914 |   labels:
915 |     app: stackdriver-metadata-agent
916 |     cluster-level: "true"
917 |   name: stackdriver-metadata-agent-cluster-level
918 |   namespace: stackdriver-agents
919 | spec:
920 |   replicas: 1
921 |   selector:
922 |     matchLabels:
923 |       app: stackdriver-metadata-agent
924 |       cluster-level: "true"
925 |   template:
926 |     metadata:
927 |       labels:
928 |         app: stackdriver-metadata-agent
929 |         cluster-level: "true"
930 |     spec:
931 |       containers:
932 |       - env:
933 |         - name: CLUSTER_NAME
934 |           valueFrom:
935 |             configMapKeyRef:
936 |               name: cluster-config
937 |               key: cluster_name
938 |         - name: CLUSTER_LOCATION
939 |           valueFrom:
940 |             configMapKeyRef:
941 |               name: cluster-config
942 |               key: cluster_location
943 |         - name: GOOGLE_APPLICATION_CREDENTIALS
944 |           valueFrom:
945 |             configMapKeyRef:
946 |               name: google-cloud-config
947 |               key: credentials_path
948 |         - name: PROMETHEUS_PORT
949 |           value: "8888"
950 |         args:
951 |         - -logtostderr
952 |         - -v=1
953 |         image: gcr.io/stackdriver-agents/metadata-agent-go:1.2.1
954 |         imagePullPolicy: IfNotPresent
955 |         name: metadata-agent
956 |         resources:
957 |           requests:
958 |             cpu: 40m
959 |             memory: 50Mi
960 |         ports:
961 |         - name: metadata-agent
962 |           containerPort: 8888
963 |         terminationMessagePath: /dev/termination-log
964 |         terminationMessagePolicy: File
965 |         volumeMounts:
966 |         - mountPath: /etc/google-cloud/
967 |           name: google-cloud-config
968 |         - mountPath: /etc/ssl/certs
969 |           name: ssl-certs
970 |       dnsPolicy: ClusterFirst
971 |       restartPolicy: Always
972 |       schedulerName: default-scheduler
973 |       securityContext: {}
974 |       serviceAccount: metadata-agent
975 |       serviceAccountName: metadata-agent
976 |       tolerations:
977 |       - operator: "Exists"
978 |         effect: "NoExecute"
979 |       - operator: "Exists"
980 |         effect: "NoSchedule"
981 |       terminationGracePeriodSeconds: 5
982 |       volumes:
983 |       - configMap:
984 |           defaultMode: 420
985 |           name: google-cloud-config
986 |         name: google-cloud-config
987 |       - hostPath:
988 |           path: /etc/ssl/certs
989 |           type: Directory
990 |         name: ssl-certs
991 |   strategy:
992 |     rollingUpdate:
993 |       maxUnavailable: 1
994 |     type: RollingUpdate
995 | 
996 | ---
997 | 


--------------------------------------------------------------------------------
/compile-agents-yaml.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -eu
 2 | #
 3 | # Copyright 2018 Google LLC
 4 | #
 5 | # Licensed under the Apache License, Version 2.0 (the "License");
 6 | # you may not use this file except in compliance with the License.
 7 | # You may obtain a copy of the License at
 8 | #
 9 | #      http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | 
17 | echo "# THIS FILE IS AUTO-GENERATED DO NOT EDIT" > agents.yaml
18 | # Append every *.yaml manifest not excluded by GLOBIGNORE, each followed by a '---' document separator.
19 | GLOBIGNORE="rbac-setup.yaml:agents.yaml"
20 | for f in *.yaml; do
21 |   cat -- "$f"
22 |   echo -e "\n---"
23 | done >> agents.yaml
24 | 
25 | echo "agents.yaml has been generated for you."
26 | # Stage the regenerated file.
27 | git add agents.yaml
28 | 
29 | 


--------------------------------------------------------------------------------
/heapster.yaml:
--------------------------------------------------------------------------------
  1 | apiVersion: apps/v1
  2 | kind: Deployment  # Heapster plus a "heapster-nanny" (addon-resizer) sidecar.
  3 | metadata:
  4 |   labels:
  5 |     k8s-app: stackdriver-heapster
  6 |     version: v1.6.1
  7 |   name: heapster
  8 |   namespace: stackdriver-agents
  9 | spec:
 10 |   replicas: 1
 11 |   selector:
 12 |     matchLabels:
 13 |       k8s-app: stackdriver-heapster  # matches the k8s-app label on the pod template below
 14 |   strategy:
 15 |     rollingUpdate:
 16 |       maxSurge: 1
 17 |       maxUnavailable: 1
 18 |     type: RollingUpdate
 19 |   template:
 20 |     metadata:
 21 |       creationTimestamp: null  # NOTE(review): looks like a leftover from an exported object; presumably safe to drop - confirm
 22 |       labels:
 23 |         k8s-app: stackdriver-heapster
 24 |         version: v1.6.1
 25 |     spec:
 26 |       containers:
 27 |       - env:  # "heapster" container: cluster identity and credentials path are read from ConfigMaps
 28 |         - name: CLUSTER_NAME
 29 |           valueFrom:
 30 |             configMapKeyRef:
 31 |               name: cluster-config
 32 |               key: cluster_name
 33 |         - name: CLUSTER_LOCATION
 34 |           valueFrom:
 35 |             configMapKeyRef:
 36 |               name: cluster-config
 37 |               key: cluster_location
 38 |         - name: GOOGLE_APPLICATION_CREDENTIALS
 39 |           valueFrom:
 40 |             configMapKeyRef:
 41 |               name: google-cloud-config
 42 |               key: credentials_path
 43 |         command:  # $(CLUSTER_NAME) and $(CLUSTER_LOCATION) below refer to the env entries above
 44 |         - /heapster
 45 |         - --source=kubernetes.summary_api:https://kubernetes.default?kubeletHttps=true&kubeletPort=10250&insecure=true
 46 |         - --sink=stackdriver:?cluster_name=$(CLUSTER_NAME)&cluster_location=$(CLUSTER_LOCATION)&zone=$(CLUSTER_LOCATION)&use_old_resources=false&use_new_resources=true&min_interval_sec=100&batch_export_timeout_sec=110
 47 |         image: gcr.io/stackdriver-agents/heapster-amd64:v1.6.1
 48 |         imagePullPolicy: Always
 49 |         livenessProbe:
 50 |           failureThreshold: 3
 51 |           httpGet:
 52 |             path: /healthz
 53 |             port: 8082
 54 |             scheme: HTTP
 55 |           initialDelaySeconds: 180  # first probe three minutes after the container starts
 56 |           periodSeconds: 10
 57 |           successThreshold: 1
 58 |           timeoutSeconds: 5
 59 |         name: heapster
 60 |         resources:  # limits are set equal to requests
 61 |           limits:
 62 |             cpu: 88m
 63 |             memory: 204Mi
 64 |           requests:
 65 |             cpu: 88m
 66 |             memory: 204Mi
 67 |         terminationMessagePath: /dev/termination-log
 68 |         terminationMessagePolicy: File
 69 |         volumeMounts:
 70 |         - mountPath: /etc/google-cloud/
 71 |           name: google-cloud-config
 72 |       - command:  # "heapster-nanny" sidecar: pod_nanny pointed at --deployment=heapster / --container=heapster
 73 |         - /pod_nanny
 74 |         - --cpu=80m
 75 |         - --extra-cpu=0.5m
 76 |         - --memory=140Mi
 77 |         - --extra-memory=4Mi
 78 |         - --threshold=5
 79 |         - --deployment=heapster
 80 |         - --container=heapster
 81 |         - --poll-period=300000
 82 |         - --estimator=exponential
 83 |         env:  # pod name and namespace injected through the downward API (fieldRef)
 84 |         - name: MY_POD_NAME
 85 |           valueFrom:
 86 |             fieldRef:
 87 |               apiVersion: v1
 88 |               fieldPath: metadata.name
 89 |         - name: MY_POD_NAMESPACE
 90 |           valueFrom:
 91 |             fieldRef:
 92 |               apiVersion: v1
 93 |               fieldPath: metadata.namespace
 94 |         image: gcr.io/google_containers/addon-resizer:1.7
 95 |         imagePullPolicy: IfNotPresent
 96 |         name: heapster-nanny
 97 |         resources:
 98 |           limits:
 99 |             cpu: 50m
100 |             memory: 112360Ki
101 |           requests:
102 |             cpu: 50m
103 |             memory: 112360Ki
104 |         terminationMessagePath: /dev/termination-log
105 |         terminationMessagePolicy: File
106 |       dnsPolicy: ClusterFirst
107 |       restartPolicy: Always
108 |       schedulerName: default-scheduler
109 |       securityContext: {}
110 |       serviceAccount: heapster  # deprecated alias of serviceAccountName; kept identical to it
111 |       serviceAccountName: heapster
112 |       terminationGracePeriodSeconds: 30
113 |       volumes:
114 |       - configMap:
115 |           defaultMode: 420  # decimal 420 == octal 0644
116 |           name: google-cloud-config
117 |         name: google-cloud-config
118 | 


--------------------------------------------------------------------------------
/hooks/pre-commit:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Git pre-commit hook: regenerate agents.yaml via the compile script in the current directory.
3 | ./compile-agents-yaml.sh
4 | 
5 | 


--------------------------------------------------------------------------------
/logging-agent.yaml:
--------------------------------------------------------------------------------
  1 | apiVersion: apps/v1
  2 | kind: DaemonSet
  3 | metadata:
  4 |   labels:
  5 |     app: stackdriver-logging-agent
  6 |   name: stackdriver-logging-agent
  7 |   namespace: stackdriver-agents
  8 | spec:
  9 |   selector:
 10 |     matchLabels:
 11 |       app: stackdriver-logging-agent
 12 |   template:
 13 |     metadata:
 14 |       labels:
 15 |         app: stackdriver-logging-agent
 16 |     spec:
 17 |       containers:
 18 |       - env:
 19 |         - name: NODE_NAME
 20 |           valueFrom:
 21 |             fieldRef:
 22 |               apiVersion: v1
 23 |               fieldPath: spec.nodeName
 24 |         - name: K8S_NODE_NAME
 25 |           valueFrom:
 26 |             fieldRef:
 27 |               apiVersion: v1
 28 |               fieldPath: spec.nodeName
 29 |         - name: GOOGLE_APPLICATION_CREDENTIALS
 30 |           valueFrom:
 31 |             configMapKeyRef:
 32 |               name: google-cloud-config
 33 |               key: credentials_path
 34 |         - name: CLUSTER_NAME
 35 |           valueFrom:
 36 |             configMapKeyRef:
 37 |               name: cluster-config
 38 |               key: cluster_name
 39 |         - name: CLUSTER_LOCATION
 40 |           valueFrom:
 41 |             configMapKeyRef:
 42 |               name: cluster-config
 43 |               key: cluster_location
 44 |         image: gcr.io/stackdriver-agents/stackdriver-logging-agent:1.10.3
 45 |         imagePullPolicy: IfNotPresent
 46 |         livenessProbe:  # Fails when the Fluentd buffer directory is missing or its files stop being updated.
 47 |           exec:
 48 |             command:  # STUCK_THRESHOLD_SECONDS (default 900): wipe buffers and fail; LIVENESS_THRESHOLD_SECONDS (default 300): fail only.
 49 |             - /bin/sh
 50 |             - -c
 51 |             - |
 52 |               LIVENESS_THRESHOLD_SECONDS=${LIVENESS_THRESHOLD_SECONDS:-300}; STUCK_THRESHOLD_SECONDS=${STUCK_THRESHOLD_SECONDS:-900}; if [ ! -e /var/run/google-fluentd/buffers ]; then
 53 |                 exit 1;
 54 |               fi; touch -d "${STUCK_THRESHOLD_SECONDS} seconds ago" /tmp/marker-stuck; if [[ -z "$(find /var/run/google-fluentd/buffers -type f -newer /tmp/marker-stuck -print -quit)" ]]; then
 55 |                 rm -rf /var/run/google-fluentd/buffers;
 56 |                 exit 1;
 57 |               fi; touch -d "${LIVENESS_THRESHOLD_SECONDS} seconds ago" /tmp/marker-liveness; if [[ -z "$(find /var/run/google-fluentd/buffers -type f -newer /tmp/marker-liveness -print -quit)" ]]; then
 58 |                 exit 1;
 59 |               fi;
 60 |           failureThreshold: 3
 61 |           initialDelaySeconds: 600
 62 |           periodSeconds: 60
 63 |           successThreshold: 1
 64 |           timeoutSeconds: 1
 65 |         name: logging-agent
 66 |         resources:
 67 |           limits:
 68 |             cpu: "1"
 69 |             memory: 300Mi
 70 |           requests:
 71 |             cpu: 100m
 72 |             memory: 200Mi
 73 |         terminationMessagePath: /dev/termination-log
 74 |         terminationMessagePolicy: File
 75 |         volumeMounts:
 76 |         - mountPath: /var/run
 77 |           name: varrun
 78 |         - mountPath: /var/log
 79 |           name: varlog
 80 |         - mountPath: /var/lib/docker/containers
 81 |           name: varlibdockercontainers
 82 |           readOnly: true
 83 |         - mountPath: /etc/google-fluentd/google-fluentd.conf
 84 |           subPath: google-fluentd.conf
 85 |           name: output-config-volume
 86 |         - mountPath: /etc/google-fluentd/config.d
 87 |           name: input-config-volume
 88 |         - mountPath: /etc/google-cloud/
 89 |           name: google-cloud-config
 90 |       serviceAccount: logging-agent
 91 |       serviceAccountName: logging-agent
 92 |       dnsPolicy: ClusterFirst
 93 |       restartPolicy: Always
 94 |       schedulerName: default-scheduler
 95 |       securityContext: {}
 96 |       tolerations:
 97 |       - operator: "Exists"
 98 |         effect: "NoExecute"
 99 |       - operator: "Exists"
100 |         effect: "NoSchedule"
101 |       volumes:
102 |       - hostPath:
103 |           path: /var/run
104 |           type: ""
105 |         name: varrun
106 |       - hostPath:
107 |           path: /var/log
108 |           type: ""
109 |         name: varlog
110 |       - hostPath:
111 |           path: /var/lib/docker/containers
112 |           type: ""
113 |         name: varlibdockercontainers
114 |       - configMap:
115 |           defaultMode: 420
116 |           name: logging-agent-output-config
117 |         name: output-config-volume
118 |       - configMap:
119 |           defaultMode: 420
120 |           name: logging-agent-input-config
121 |         name: input-config-volume
122 |       - configMap:
123 |           defaultMode: 420
124 |           name: google-cloud-config
125 |         name: google-cloud-config
126 |   updateStrategy:
127 |     rollingUpdate:
128 |       maxUnavailable: 1
129 |     type: RollingUpdate
130 | ---
131 | # Config map for Logging Agent input and corresponding filter plugins.
132 | apiVersion: v1
133 | kind: ConfigMap
134 | metadata:
135 |   name: logging-agent-input-config
136 |   namespace: stackdriver-agents
137 | data:
138 |   1.containers.input.conf: |-
139 |     # This configuration file for Fluentd is used
140 |     # to watch changes to Docker log files that live in the
141 |     # directory /var/lib/docker/containers/ and are symbolically
142 |     # linked to from the /var/log/containers directory using names that capture the
143 |     # pod name and container name. These logs are then submitted to
144 |     # Google Cloud Logging which assumes the installation of the cloud-logging plug-in.
145 |     #
146 |     # Example
147 |     # =======
148 |     # A line in the Docker log file might look like this JSON:
149 |     #
150 |     # {"log":"2014/09/25 21:15:03 Got request with path wombat\\n",
151 |     #  "stream":"stderr",
152 |     #   "time":"2014-09-25T21:15:03.499185026Z"}
153 |     #
154 |     # The original tag is derived from the log file's location.
155 |     # For example a Docker container's logs might be in the directory:
156 |     #  /var/lib/docker/containers/997599971ee6366d4a5920d25b79286ad45ff37a74494f262e3bc98d909d0a7b
157 |     # and in the file:
158 |     #  997599971ee6366d4a5920d25b79286ad45ff37a74494f262e3bc98d909d0a7b-json.log
159 |     # where 997599971ee6... is the Docker ID of the running container.
160 |     # The Kubernetes kubelet makes a symbolic link to this file on the host
161 |     # machine in the /var/log/containers directory which includes the pod name,
162 |     # the namespace name and the Kubernetes container name:
163 |     #    synthetic-logger-0.25lps-pod_default_synth-lgr-997599971ee6366d4a5920d25b79286ad45ff37a74494f262e3bc98d909d0a7b.log
164 |     #    ->
165 |     #    /var/lib/docker/containers/997599971ee6366d4a5920d25b79286ad45ff37a74494f262e3bc98d909d0a7b/997599971ee6366d4a5920d25b79286ad45ff37a74494f262e3bc98d909d0a7b-json.log
166 |     # The /var/log directory on the host is mapped to the /var/log directory in the container
167 |     # running this instance of Fluentd and we end up collecting the file:
168 |     #   /var/log/containers/synthetic-logger-0.25lps-pod_default_synth-lgr-997599971ee6366d4a5920d25b79286ad45ff37a74494f262e3bc98d909d0a7b.log
169 |     # This results in the tag:
170 |     #  var.log.containers.synthetic-logger-0.25lps-pod_default_synth-lgr-997599971ee6366d4a5920d25b79286ad45ff37a74494f262e3bc98d909d0a7b.log
171 |     # where 'synthetic-logger-0.25lps-pod' is the pod name, 'default' is the
172 |     # namespace name, 'synth-lgr' is the container name and '997599971ee6..' is
173 |     # the container ID.
174 |     # The record reformer is used to extract pod_name, namespace_name and
175 |     # container_name from the tag and set them in a local_resource_id in the
176 |     # format of:
177 |     # 'k8s_container.<NAMESPACE_NAME>.<POD_NAME>.<CONTAINER_NAME>'.
178 |     # The reformer also changes the tags to 'stderr' or 'stdout' based on the
179 |     # value of 'stream'.
180 |     # local_resource_id is later used by google_cloud plugin to determine the
181 |     # monitored resource to ingest logs against.
182 | 
183 |     # Json Log Example:
184 |     # {"log":"[info:2016-02-16T16:04:05.930-08:00] Some log text here\n","stream":"stdout","time":"2016-02-17T00:04:05.931087621Z"}
185 |     # CRI Log Example:
186 |     # 2016-02-17T00:04:05.931087621Z stdout F [info:2016-02-16T16:04:05.930-08:00] Some log text here
187 |     <source>
188 |       @type tail
189 |       path /var/log/containers/*.log
190 |       pos_file /var/run/google-fluentd/pos-files/gcp-containers.pos
191 |       # Tags at this point are in the format of:
192 |       # reform.var.log.containers.<POD_NAME>_<NAMESPACE_NAME>_<CONTAINER_NAME>-<CONTAINER_ID>.log
193 |       tag reform.*
194 |       read_from_head true
195 |       <parse>
196 |         @type multi_format
197 |         <pattern>
198 |           format json
199 |           time_key time
200 |           time_format %Y-%m-%dT%H:%M:%S.%NZ
201 |         </pattern>
202 |         <pattern>
203 |           format /^(?<time>.+) (?<stream>stdout|stderr) [^ ]* (?<log>.*)$/
204 |           time_format %Y-%m-%dT%H:%M:%S.%N%:z
205 |         </pattern>
206 |       </parse>
207 |     </source>
208 | 
209 |     <filter reform.**>
210 |       @type parser
211 |       format /^(?<severity>\w)(?<time>\d{4} [^\s]*)\s+(?<pid>\d+)\s+(?<source>[^ \]]+)\] (?<log>.*)/
212 |       reserve_data true
213 |       suppress_parse_error_log true
214 |       emit_invalid_record_to_error false
215 |       key_name log
216 |     </filter>
217 | 
218 |     <filter reform.**>
219 |       # This plugin uses environment variables KUBERNETES_SERVICE_HOST and
220 |       # KUBERNETES_SERVICE_PORT to talk to the API server. These environment
221 |       # variables are added by kubelet automatically.
222 |       @type kubernetes_metadata
223 |       # Interval in seconds to dump cache stats locally in the Fluentd log.
224 |       stats_interval 300
225 |       # TTL in seconds of each cached element.
226 |       cache_ttl 30
227 |       # Skip fetching unused metadata.
228 |       skip_container_metadata true
229 |       skip_master_url true
230 |       skip_namespace_metadata true
231 |     </filter>
232 | 
233 |     <filter reform.**>
234 |       # We have to use record_modifier because only this plugin supports complex
235 |       # logic to modify record the way we need.
236 |       @type record_modifier
237 |       enable_ruby true
238 |       <record>
239 |         # Extract "kubernetes"->"labels" and set them as
240 |         # "logging.googleapis.com/labels". Prefix these labels with
241 |         # "k8s-pod" to distinguish with other labels and avoid
242 |         # label name collision with other types of labels.
243 |         _dummy_ ${if record.is_a?(Hash) && record.has_key?('kubernetes') && record['kubernetes'].has_key?('labels') && record['kubernetes']['labels'].is_a?(Hash); then; record["logging.googleapis.com/labels"] = record['kubernetes']['labels'].map{ |k, v| ["k8s-pod/#{k}", v]}.to_h; end; nil}
244 |       </record>
245 |       # Delete this dummy field and the rest of "kubernetes" and "docker".
246 |       remove_keys _dummy_,kubernetes,docker
247 |     </filter>
248 | 
249 |     <match reform.**>
250 |       @type record_reformer
251 |       enable_ruby true
252 |       <record>
253 |         # Extract local_resource_id from tag for 'k8s_container' monitored
254 |         # resource. The format is:
255 |         # 'k8s_container.<namespace_name>.<pod_name>.<container_name>'.
256 |         "logging.googleapis.com/local_resource_id" ${"k8s_container.#{tag_suffix[4].rpartition('.')[0].split('_')[1]}.#{tag_suffix[4].rpartition('.')[0].split('_')[0]}.#{tag_suffix[4].rpartition('.')[0].split('_')[2].rpartition('-')[0]}"}
257 |         # Rename the field 'log' to a more generic field 'message'. This way the
258 |         # fluent-plugin-google-cloud knows to flatten the field as textPayload
259 |         # instead of jsonPayload after extracting 'time', 'severity' and
260 |         # 'stream' from the record.
261 |         message ${record['log']}
262 |         # If 'severity' is not set, assume stderr is ERROR and stdout is INFO.
263 |         severity ${record['severity'] || if record['stream'] == 'stderr' then 'ERROR' else 'INFO' end}
264 |       </record>
265 |       tag ${if record['stream'] == 'stderr' then 'raw.stderr' else 'raw.stdout' end}
266 |       remove_keys stream,log
267 |     </match>
268 | 
269 |     # Detect exceptions in the log output and forward them as one log entry.
270 |     <match {raw.stderr,raw.stdout}>
271 |       @type detect_exceptions
272 | 
273 |       remove_tag_prefix raw
274 |       message message
275 |       stream "logging.googleapis.com/local_resource_id"
276 |       multiline_flush_interval 5
277 |       max_bytes 500000
278 |       max_lines 1000
279 |     </match>
280 |   2.pods.input.conf: |-
281 |     # This Fluentd configuration tails the Kubernetes pod log files that
282 |     # live directly in the directories /var/log/pods/NAMESPACE_NAME_UID.
283 |     # The tag generated from the file path is used to capture the pod
284 |     # namespace and name. These logs are then submitted to Google Cloud
285 |     # Logging with a local_resource_id of the form
286 |     # 'k8s_pod.<NAMESPACE_NAME>.<POD_NAME>', which assumes the installation
287 |     # of the cloud-logging plug-in.
288 |     <source>
289 |       @type tail
290 |       path /var/log/pods/*/*.log
291 |       pos_file /var/run/google-fluentd/pos-files/gcp-pods.pos
292 |       # Tags at this point are in the format of:
293 |       # pods.reform.var.log.pods.<POD_NAMESPACE>_<POD_NAME>_<POD_UID>.<FILE_NAME>.log
294 |       tag pods.reform.*
295 |       read_from_head true
296 |       <parse>
297 |         @type none
298 |       </parse>
299 |     </source>
300 |     <match pods.reform.**>
301 |       @type record_reformer
302 |       enable_ruby true
303 |       <record>
304 |         # Extract local_resource_id from tag for 'k8s_pod' monitored
305 |         # resource. The format is:
306 |         # 'k8s_pod.<namespace_name>.<pod_name>'.
307 |         "logging.googleapis.com/local_resource_id" ${"k8s_pod.#{tag_suffix[5].rpartition('.')[0].split('_')[0]}.#{tag_suffix[5].rpartition('.')[0].split('_')[1]}"}
308 |       </record>
309 |       # Use the log file name as the tag. Currently only `gvisor` log is supported.
310 |       tag ${"#{tag_suffix[5].rpartition('.')[0].rpartition('.')[2]}"}
311 |     </match>
312 |   7.system.input.conf: |-
313 |     # Startup script log, parsed as syslog. Example:
314 |     # Dec 21 23:17:22 gke-foo-1-1-4b5cbd14-node-4eoj startupscript: Finished running startup script /var/run/google.startup.script
315 |     <source>
316 |       @type tail
317 |       format syslog
318 |       path /var/log/startupscript.log
319 |       pos_file /var/run/google-fluentd/pos-files/gcp-startupscript.pos
320 |       tag startupscript
321 |     </source>
322 | 
323 |     # kube-proxy log, parsed into severity, time, pid, source and message. Example:
324 |     # I1118 21:26:53.975789       6 proxier.go:1096] Port "nodePort for kube-system/default-http-backend:http" (:31429/tcp) was open before and is still needed
325 |     <source>
326 |       @type tail
327 |       format multiline
328 |       multiline_flush_interval 5s
329 |       format_firstline /^\w\d{4}/
330 |       format1 /^(?<severity>\w)(?<time>\d{4} [^\s]*)\s+(?<pid>\d+)\s+(?<source>[^ \]]+)\] (?<message>.*)/
331 |       time_format %m%d %H:%M:%S.%N
332 |       path /var/log/kube-proxy.log
333 |       pos_file /var/run/google-fluentd/pos-files/gcp-kube-proxy.pos
334 |       tag kube-proxy
335 |     </source>
336 | 
337 |     # Logs from systemd-journal for interesting services.
338 |     # TODO(random-liu): Keep this for compatibility, remove this after
339 |     # cri container runtime rolls out.
340 |     <source>
341 |       @type systemd
342 |       filters [{ "_SYSTEMD_UNIT": "docker.service" }]
343 |       <storage>
344 |         @type local
345 |         path /var/run/google-fluentd/pos-files/gcp-journald-docker.pos
346 |       </storage>
347 |       read_from_head true
348 |       tag docker
349 |     </source>
350 | 
351 |     <source>
352 |       @type systemd
353 |       filters [{ "_SYSTEMD_UNIT": "containerd.service" }]
354 |       <storage>
355 |         @type local
356 |         path /var/run/google-fluentd/pos-files/gcp-journald-container-runtime.pos
357 |       </storage>
358 |       read_from_head true
359 |       tag container-runtime
360 |     </source>
361 | 
362 |     <source>
363 |       @type systemd
364 |       filters [{ "_SYSTEMD_UNIT": "kubelet.service" }]
365 |       <storage>
366 |         @type local
367 |         path /var/run/google-fluentd/pos-files/gcp-journald-kubelet.pos
368 |       </storage>
369 |       read_from_head true
370 |       tag kubelet
371 |     </source>
372 | 
373 |     # kube-node-installation, kube-node-configuration, and kube-logrotate are
374 |     # oneshots, but it's extremely valuable to have their logs on Stackdriver
375 |     # as they can diagnose critical issues with node startup.
376 |     # See http://cs/cloud-gke-kubernetes/cluster/gce/gci/node.yaml.
377 |     <source>
378 |       @type systemd
379 |       filters [{ "_SYSTEMD_UNIT": "kube-node-installation.service" }]
380 |       <storage>
381 |         @type local
382 |         path /var/run/google-fluentd/pos-files/gcp-journald-kube-node-installation.pos
383 |       </storage>
384 |       read_from_head true
385 |       tag kube-node-installation
386 |     </source>
387 | 
388 |     <source>
389 |       @type systemd
390 |       filters [{ "_SYSTEMD_UNIT": "kube-node-configuration.service" }]
391 |       <storage>
392 |         @type local
393 |         path /var/run/google-fluentd/pos-files/gcp-journald-kube-node-configuration.pos
394 |       </storage>
395 |       read_from_head true
396 |       tag kube-node-configuration
397 |     </source>
398 | 
399 |     <source>
400 |       @type systemd
401 |       filters [{ "_SYSTEMD_UNIT": "kube-logrotate.service" }]
402 |       <storage>
403 |         @type local
404 |         path /var/run/google-fluentd/pos-files/gcp-journald-kube-logrotate.pos
405 |       </storage>
406 |       read_from_head true
407 |       tag kube-logrotate
408 |     </source>
409 | 
410 |     <source>
411 |       @type systemd
412 |       filters [{ "_SYSTEMD_UNIT": "node-problem-detector.service" }]
413 |       <storage>
414 |         @type local
415 |         path /var/run/google-fluentd/pos-files/gcp-journald-node-problem-detector.pos
416 |       </storage>
417 |       read_from_head true
418 |       tag node-problem-detector
419 |     </source>
420 | 
421 |     <source>
422 |       @type systemd
423 |       filters [{ "_SYSTEMD_UNIT": "kube-container-runtime-monitor.service" }]
424 |       <storage>
425 |         @type local
426 |         path /var/run/google-fluentd/pos-files/gcp-journald-kube-container-runtime-monitor.pos
427 |       </storage>
428 |       read_from_head true
429 |       tag kube-container-runtime-monitor
430 |     </source>
431 | 
432 |     <source>
433 |       @type systemd
434 |       filters [{ "_SYSTEMD_UNIT": "kubelet-monitor.service" }]
435 |       <storage>
436 |         @type local
437 |         path /var/run/google-fluentd/pos-files/gcp-journald-kubelet-monitor.pos
438 |       </storage>
439 |       read_from_head true
440 |       tag kubelet-monitor
441 |     </source>
442 | 
443 |     # Whether to include node-journal or not is determined when starting the cluster. It is not changed
444 |     # when the cluster is already running. The grep filter below drops units that have a dedicated source above.
445 |     <source>
446 |       @type systemd
447 |       <storage>
448 |         @type local
449 |         path /var/run/google-fluentd/pos-files/gcp-journald.pos
450 |       </storage>
451 |       read_from_head true
452 |       tag node-journal
453 |     </source>
454 | 
455 |     <filter node-journal>
456 |       @type grep
457 |       <exclude>
458 |         key _SYSTEMD_UNIT
459 |         pattern ^(docker|containerd|kubelet|kube-node-installation|kube-node-configuration|kube-logrotate|node-problem-detector|kube-container-runtime-monitor|kubelet-monitor)\.service$
460 |       </exclude>
461 |     </filter>
462 |   5.monitoring.conf: |-
463 |     # This source is used to acquire an approximate process start timestamp,
464 |     # whose purpose is explained before the corresponding output plugin.
465 |     <source>
466 |       @type exec
467 |       command /bin/sh -c 'date +%s'
468 |       tag process_start
469 |       time_format %Y-%m-%d %H:%M:%S
470 |       keys process_start_timestamp
471 |     </source>
472 | 
473 |     # This filter is used to convert the process start timestamp to an integer
474 |     # value for correct ingestion in the prometheus output plugin.
475 |     <filter process_start>
476 |       @type record_transformer
477 |       enable_ruby true
478 |       auto_typecast true
479 |       <record>
480 |         process_start_timestamp ${record["process_start_timestamp"].to_i}
481 |       </record>
482 |     </filter>
483 |   6.output.conf: |-
484 |     # This match is placed before the all-matching output to provide metric
485 |     # exporter with a process start timestamp for correct exporting of
486 |     # cumulative metrics to Stackdriver.
487 |     <match process_start>
488 |       @type prometheus
489 | 
490 |       <metric>
491 |         type gauge
492 |         name process_start_time_seconds
493 |         desc Timestamp of the process start in seconds
494 |         key process_start_timestamp
495 |       </metric>
496 |     </match>
497 | 
498 |     # This filter makes it possible to count the log entries read by fluentd
499 |     # before they are processed by the output plugin. This in turn allows to
500 |     # monitor the number of log entries that were read but never sent, e.g.
501 |     # because of liveness probe removing buffer.
502 |     <filter **>
503 |       @type prometheus
504 |       <metric>
505 |         type counter
506 |         name logging_entry_count
507 |         desc Total number of log entries generated by either application containers or system components
508 |       </metric>
509 |     </filter>
510 | 
511 |     # This section is exclusive for k8s_container logs. Those come with
512 |     # 'stderr'/'stdout' tags.
513 |     # TODO(instrumentation): Reconsider this workaround later.
514 |     # Trim messages longer than 100000 characters (slightly less than 100KB)
515 |     # to avoid dropping them, because Stackdriver only supports entries that
516 |     # are up to 100KB in size. NOTE(review): length counts characters, not bytes.
517 |     <filter {stderr,stdout}>
518 |       @type record_transformer
519 |       enable_ruby true
520 |       <record>
521 |         message ${record['message'].length > 100000 ? "[Trimmed]#{record['message'][0..100000]}..." : record['message']}
522 |       </record>
523 |     </filter>
524 | 
525 |     # Do not collect fluentd's own logs to avoid infinite loops.
526 |     <match fluent.**>
527 |       @type null
528 |     </match>
529 | 
530 |     # Add a unique insertId to each log entry that doesn't already have it.
531 |     # This helps guarantee the order and prevent log duplication.
532 |     <filter **>
533 |       @type add_insert_ids
534 |     </filter>
535 | 
536 |     # This filter parses the 'source' field created for glog lines into a single
537 |     # top-level field, for proper processing by the output plugin.
538 |     # For example, if a record includes:
539 |     #     {"source":"handlers.go:131"},
540 |     # then the following entry will be added to the record:
541 |     #     {"logging.googleapis.com/sourceLocation":
542 |     #          {"file":"handlers.go", "line":"131"}
543 |     #     }
544 |     <filter **>
545 |       @type record_transformer
546 |       enable_ruby true
547 |       <record>
548 |         "logging.googleapis.com/sourceLocation" ${if record.is_a?(Hash) && record.has_key?('source'); source_parts = record['source'].split(':', 2); {'file' => source_parts[0], 'line' => source_parts[1]} if source_parts.length == 2; else; nil; end}
549 |       </record>
550 |     </filter>
551 | 
552 | 
553 |     # This section is exclusive for k8s_container logs. These logs come with
554 |     # 'stderr'/'stdout' tags.
555 |     # We use a separate output stanza for 'k8s_node' logs with a smaller buffer
556 |     # because node logs are less important than user's container logs.
557 |     <match {stderr,stdout}>
558 |       @type google_cloud
559 | 
560 |       # Try to detect JSON formatted log entries.
561 |       detect_json true
562 |       # Collect metrics in Prometheus registry about plugin activity.
563 |       enable_monitoring true
564 |       monitoring_type prometheus
565 |       # Allow log entries from multiple containers to be sent in the same request.
566 |       split_logs_by_tag false
567 |       # Set the buffer type to file to improve the reliability and reduce the memory consumption
568 |       buffer_type file
569 |       buffer_path /var/run/google-fluentd/buffers/kubernetes.containers.buffer
570 |       # Set queue_full action to block because we want to pause gracefully
571 |       # in case of the off-the-limits load instead of throwing an exception
572 |       buffer_queue_full_action block
573 |       # Set the chunk limit conservatively to avoid exceeding the recommended
574 |       # chunk size of 5MB per write request.
575 |       buffer_chunk_limit 512k
576 |       # Cap the combined memory usage of this buffer and the one below to
577 |       # 512KiB/chunk * (6 + 2) chunks = 4 MiB
578 |       buffer_queue_limit 6
579 |       # Never wait more than 5 seconds before flushing logs in the non-error case.
580 |       flush_interval 5s
581 |       # Never wait longer than 30 seconds between retries.
582 |       max_retry_wait 30
583 |       # Disable the limit on the number of retries (retry forever).
584 |       disable_retry_limit
585 |       # Use multiple threads for processing.
586 |       num_threads 2
587 |       use_grpc true
588 |       k8s_cluster_name "#{ENV["CLUSTER_NAME"]}"
589 |       k8s_cluster_location "#{ENV["CLUSTER_LOCATION"]}"
590 |       # Skip timestamp adjustment as this is in a controlled environment with
591 |       # known timestamp format. This helps with CPU usage.
592 |       adjust_invalid_timestamps false
593 |     </match>
594 | 
595 |     # This section is exclusive for 'gvisor' logs. These logs come with the
596 |     # `gvisor` tag.
597 |     # We use a separate output stanza for 'gvisor' logs with a smaller
598 |     # buffer because user's container application logs are more important.
599 |     <match gvisor>
600 |       @type google_cloud
601 |       detect_json true
602 |       enable_monitoring true
603 |       monitoring_type prometheus
604 |       # Allow log entries from multiple pods to be sent in the same request.
605 |       split_logs_by_tag false
606 |       buffer_type file
607 |       buffer_path /var/run/google-fluentd/buffers/kubernetes.pod.buffer
608 |       buffer_queue_full_action block
609 |       buffer_chunk_limit 512k
610 |       buffer_queue_limit 2
611 |       flush_interval 5s
612 |       max_retry_wait 30
613 |       disable_retry_limit
614 |       num_threads 2
615 |       use_grpc true
616 |       # Skip timestamp adjustment as this is in a controlled environment with
617 |       # known timestamp format. This helps with CPU usage.
618 |       adjust_invalid_timestamps false
619 |     </match>
620 | 
621 |     # Attach local_resource_id for 'k8s_node' monitored resource.
622 |     <filter **>
623 |       @type record_transformer
624 |       enable_ruby true
625 |       <record>
626 |         "logging.googleapis.com/local_resource_id" ${"k8s_node.#{ENV['NODE_NAME']}"}
627 |       </record>
628 |     </filter>
629 | 
630 |     # This section is exclusive for 'k8s_node' logs. These logs come with tags
631 |     # that are neither 'stderr' nor 'stdout'.
632 |     # We use a separate output stanza for 'k8s_container' logs with a larger
633 |     # buffer because user's container logs are more important than node logs.
634 |     <match **>
635 |       @type google_cloud
636 | 
637 |       detect_json true
638 |       enable_monitoring true
639 |       monitoring_type prometheus
640 |       # Allow entries from multiple system logs to be sent in the same request.
641 |       split_logs_by_tag false
642 |       detect_subservice false
643 |       buffer_type file
644 |       buffer_path /var/run/google-fluentd/buffers/kubernetes.system.buffer
645 |       buffer_queue_full_action block
646 |       buffer_chunk_limit 512k
647 |       buffer_queue_limit 2
648 |       flush_interval 5s
649 |       max_retry_wait 30
650 |       disable_retry_limit
651 |       num_threads 2
652 |       use_grpc true
653 |       k8s_cluster_name "#{ENV["CLUSTER_NAME"]}"
654 |       k8s_cluster_location "#{ENV["CLUSTER_LOCATION"]}"
655 |       # Skip timestamp adjustment as this is in a controlled environment with
656 |       # known timestamp format. This helps with CPU usage.
657 |       adjust_invalid_timestamps false
658 |     </match>
659 | 
660 |   4.knative.input.conf: |-
661 |     # This configuration file for Fluentd is used to collect the logs located
662 |     # inside the /var/log directory of a Cloud Run on GKE / Knative container.
663 |     # Knative mounts an emptyDir volume named 'knative-var-log' inside the
664 |     # user container and if collection is enabled it creates a symbolic link
665 |     # inside another emptyDir named 'knative-internal' that contains the
666 |     # information needed for Kubernetes metadata enrichment.
667 |     #
668 |     # Concretely, on the host the symbolic link is:
669 |     # /var/lib/kubelet/pods/<POD_ID>/volumes/kubernetes.io~empty-dir/knative-internal/<NAMESPACE_NAME>_<POD_NAME>_<CONTAINER_NAME>
670 |     # ->
671 |     # /var/lib/kubelet/pods/<POD_ID>/volumes/kubernetes.io~empty-dir/knative-var-log
672 |     #
673 |     # The record reformer is used to extract pod_name, namespace_name and
674 |     # container_name from the tag and set them in a local_resource_id in the
675 |     # format of:
676 |     # 'k8s_container.<NAMESPACE_NAME>.<POD_NAME>.<CONTAINER_NAME>'.
677 |     # The reformer also sets the label 'source' to the path of the log file as seen
678 |     # from the user container point of view to be able to identify the source of
679 |     # a log entry.
680 |     <source>
681 |       @type tail
682 |       # **/*/**/* allows path expansion to go through the symbolic link and then recursively through /var/log
683 |       path /var/lib/kubelet/pods/*/volumes/kubernetes.io~empty-dir/knative-internal/**/*/**/*
684 |       # Save the path of the file as field 'source'
685 |       path_key source
686 |       pos_file /var/run/google-fluentd/pos-files/knative-var-log.pos
687 |       # Tags at this point are in the format of:
688 |       # knative.reform.var.lib.kubelet.pods.<POD_ID>.volumes.kubernetes.io~empty-dir.knative-internal.<NAMESPACE_NAME>_<POD_NAME>_<CONTAINER_NAME>.<PATH_INSIDE_VAR_LOG>
689 |       tag knative.reform.*
690 |       read_from_head true
691 |       <parse>
692 |         @type multi_format
693 |         <pattern>
694 |           format json
695 |           time_key time
696 |           time_format %Y-%m-%dT%H:%M:%S.%NZ
697 |         </pattern>
698 |         <pattern>
699 |           format /^(?<time>.+) (?<stream>stdout|stderr) [^ ]* (?<log>.*)$/
700 |           time_format %Y-%m-%dT%H:%M:%S.%N%:z
701 |         </pattern>
702 |         <pattern>
703 |           format none
704 |           message_key log
705 |         </pattern>
706 |       </parse>
707 |     </source>
708 | 
709 |     <filter knative.reform.**>
710 |       @type parser
711 |       format /^(?<severity>\w)(?<time>\d{4} [^\s]*)\s+(?<pid>\d+)\s+(?<source>[^ \]]+)\] (?<log>.*)/
712 |       reserve_data true
713 |       suppress_parse_error_log true
714 |       emit_invalid_record_to_error false
715 |       key_name log
716 |     </filter>
717 | 
718 |     <filter knative.reform.**>
719 |       # This plugin uses environment variables KUBERNETES_SERVICE_HOST and
720 |       # KUBERNETES_SERVICE_PORT to talk to the API server. These environment
721 |       # variables are added by kubelet automatically.
722 |       @type kubernetes_metadata
723 |       # Interval in seconds to dump cache stats locally in the Fluentd log.
724 |       stats_interval 300
725 |       # TTL in seconds of each cached element.
726 |       cache_ttl 30
727 |       # Custom regex to extract the fields
728 |       tag_to_kubernetes_name_regexp (?<docker_id>[a-z0-9]{8}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{12})\.volumes.kubernetes\.io~empty-dir\.knative-internal\.(?<namespace>[^_]+)_(?<pod_name>[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*)_(?<container_name>user-container)\..*?$
729 |     </filter>
730 | 
731 |     <filter knative.reform.**>
732 |       # We have to use record_modifier because only this plugin supports complex
733 |       # logic to modify the record the way we need.
734 |       @type record_modifier
735 |       enable_ruby true
736 |       <record>
737 |         # Extract "kubernetes"->"labels" and set them as
738 |         # "logging.googleapis.com/labels". Prefix these labels with
739 |         # "k8s-pod" to distinguish with other labels and avoid
740 |         # label name collision with other types of labels.
741 |         _dummy_ ${if record.is_a?(Hash) && record.has_key?('kubernetes') && record['kubernetes'].has_key?('labels') && record['kubernetes']['labels'].is_a?(Hash); then; record['logging.googleapis.com/labels'] = record['kubernetes']['labels'].map{ |k, v| ["k8s-pod/#{k}", v]}.to_h; end; nil}
742 |       </record>
743 |       # Delete this dummy field and the rest of "kubernetes" and "docker".
744 |       remove_keys _dummy_,kubernetes,docker
745 |     </filter>
746 | 
747 |     <match knative.reform.**>
748 |       @type record_reformer
749 |       enable_ruby true
750 |       <record>
751 |         # Extract local_resource_id from tag for 'k8s_container' monitored
752 |         # resource. The format is:
753 |         # 'k8s_container.<namespace_name>.<pod_name>.<container_name>'.
754 |         "logging.googleapis.com/local_resource_id" ${"k8s_container.#{tag_parts[11].gsub('_', '.')}"}
755 |         # Rename the field 'log' to a more generic field 'message'. This way the
756 |         # fluent-plugin-google-cloud knows to flatten the field as textPayload
757 |         # instead of jsonPayload after extracting 'time', 'severity' and
758 |         # 'stream' from the record.
759 |         message ${record['log']}
760 |         # If 'severity' is not set, assume it is INFO.
761 |         severity ${record['severity'] || 'INFO'}
762 |         # Set 'source' label to the log file path as seen from the Knative container. NOTE(review): assumes the labels hash was set by the filter above - confirm.
763 |         _dummy_ ${record["logging.googleapis.com/labels"]["source"] = "/var/log/" + record["source"].scan(/\/knative-internal\/[^\/]+\/(.*)/).last.last}
764 |       </record>
765 |       tag knative.stdout
766 |       remove_keys _dummy_,log,source
767 |     </match>
768 | 
769 |     # Detect exceptions in the log output and forward them as one log entry.
770 |     <match knative.stdout>
771 |       @type detect_exceptions
772 |       remove_tag_prefix knative
773 |       message message
774 |       stream "logging.googleapis.com/local_resource_id"
775 |       multiline_flush_interval 5
776 |       max_bytes 500000
777 |       max_lines 1000
778 |     </match>
779 | ---
780 | # Config map for Logging Agent output and corresponding filter plugins. Its google-fluentd.conf only includes the config.d/*.conf files.
781 | apiVersion: v1
782 | kind: ConfigMap
783 | metadata:
784 |   name: logging-agent-output-config
785 |   namespace: stackdriver-agents
786 | data:
787 |   google-fluentd.conf: |-
788 |     @include config.d/*.conf
789 | 


--------------------------------------------------------------------------------
/metadata-agent.yaml:
--------------------------------------------------------------------------------
 1 | # Cluster-level Stackdriver Metadata Agent, deployed as a single replica.
 2 | apiVersion: apps/v1
 3 | kind: Deployment
 4 | metadata:
 5 |   name: stackdriver-metadata-agent-cluster-level
 6 |   namespace: stackdriver-agents
 7 |   labels:
 8 |     app: stackdriver-metadata-agent
 9 |     cluster-level: "true"
10 | spec:
11 |   replicas: 1
12 |   strategy:
13 |     type: RollingUpdate
14 |     rollingUpdate:
15 |       maxUnavailable: 1
16 |   selector:
17 |     matchLabels:
18 |       app: stackdriver-metadata-agent
19 |       cluster-level: "true"
20 |   template:
21 |     metadata:
22 |       labels:
23 |         app: stackdriver-metadata-agent
24 |         cluster-level: "true"
25 |     spec:
26 |       serviceAccountName: metadata-agent
27 |       serviceAccount: metadata-agent
28 |       schedulerName: default-scheduler
29 |       dnsPolicy: ClusterFirst
30 |       restartPolicy: Always
31 |       terminationGracePeriodSeconds: 5
32 |       securityContext: {}
33 |       # Key-less "Exists" tolerations: one per taint effect listed.
34 |       tolerations:
35 |       - effect: "NoExecute"
36 |         operator: "Exists"
37 |       - effect: "NoSchedule"
38 |         operator: "Exists"
39 |       containers:
40 |       - name: metadata-agent
41 |         image: gcr.io/stackdriver-agents/metadata-agent-go:1.3.0
42 |         imagePullPolicy: IfNotPresent
43 |         args:
44 |         - -logtostderr
45 |         - -v=1
46 |         env:
47 |         # Cluster identity is read from the cluster-config ConfigMap.
48 |         - name: CLUSTER_NAME
49 |           valueFrom:
50 |             configMapKeyRef:
51 |               key: cluster_name
52 |               name: cluster-config
53 |         - name: CLUSTER_LOCATION
54 |           valueFrom:
55 |             configMapKeyRef:
56 |               key: cluster_location
57 |               name: cluster-config
58 |         # Credentials path is read from the google-cloud-config ConfigMap.
59 |         - name: GOOGLE_APPLICATION_CREDENTIALS
60 |           valueFrom:
61 |             configMapKeyRef:
62 |               key: credentials_path
63 |               name: google-cloud-config
64 |         # Same port number as the containerPort declared below.
65 |         - name: PROMETHEUS_PORT
66 |           value: "8888"
67 |         ports:
68 |         - containerPort: 8888
69 |           name: metadata-agent
70 |         resources:
71 |           requests:
72 |             cpu: 40m
73 |             memory: 50Mi
74 |         terminationMessagePath: /dev/termination-log
75 |         terminationMessagePolicy: File
76 |         volumeMounts:
77 |         - name: google-cloud-config
78 |           mountPath: /etc/google-cloud/
79 |         - name: ssl-certs
80 |           mountPath: /etc/ssl/certs
81 |       volumes:
82 |       - name: google-cloud-config
83 |         configMap:
84 |           name: google-cloud-config
85 |           # 420 decimal is 0644 octal.
86 |           defaultMode: 420
87 |       - name: ssl-certs
88 |         hostPath:
89 |           path: /etc/ssl/certs
90 |           type: Directory
91 | 


--------------------------------------------------------------------------------
/rbac-setup.yaml:
--------------------------------------------------------------------------------
  1 | # How to apply this YAML file:
  2 | # $ kubectl apply -f rbac-setup.yaml --as=admin --as-group=system:masters
  3 | #
  4 | # Namespace shared by the Stackdriver Agents resources defined below.
  5 | apiVersion: v1
  6 | kind: Namespace
  7 | metadata:
  8 |   name: stackdriver-agents
  9 | ---
 10 | # Config map providing the value of GOOGLE_APPLICATION_CREDENTIALS.
 11 | apiVersion: v1
 12 | kind: ConfigMap
 13 | metadata:
 14 |   name: google-cloud-config
 15 |   namespace: stackdriver-agents
 16 | data:
 17 |   credentials_path: ""
 18 | ---
 19 | # Config map providing the CLUSTER_NAME and CLUSTER_LOCATION env var values.
 20 | apiVersion: v1
 21 | kind: ConfigMap
 22 | metadata:
 23 |   name: cluster-config
 24 |   namespace: stackdriver-agents
 25 | data:
 26 |   cluster_name: ""
 27 |   cluster_location: ""
 28 | ---
 29 | # Service account for Metadata Agent; the ClusterRoleBinding below grants it the stackdriver-user:metadata-agent ClusterRole.
 30 | apiVersion: v1
 31 | kind: ServiceAccount
 32 | metadata:
 33 |   name: metadata-agent
 34 |   namespace: stackdriver-agents
 35 | ---
 36 | # ClusterRole (cluster-scoped, hence no namespace) with the permissions required
 37 | # by Metadata Agent: get/list/watch on every resource in every API group.
 38 | apiVersion: rbac.authorization.k8s.io/v1
 39 | kind: ClusterRole
 40 | metadata:
 41 |   name: stackdriver-user:metadata-agent
 42 | rules:
 43 | - apiGroups:
 44 |   - '*'
 45 |   resources:
 46 |   - '*'
 47 |   verbs:
 48 |   - watch
 49 |   - get
 50 |   - list
 51 | ---
 52 | # ClusterRoleBinding (cluster-scoped, hence no metadata.namespace) granting
 53 | # the Metadata Agent ClusterRole to the metadata-agent service account.
 54 | apiVersion: rbac.authorization.k8s.io/v1
 55 | kind: ClusterRoleBinding
 56 | metadata:
 57 |   name: stackdriver-user:metadata-agent
 58 | roleRef:
 59 |   apiGroup: rbac.authorization.k8s.io
 60 |   kind: ClusterRole
 61 |   name: stackdriver-user:metadata-agent
 62 | subjects:
 63 | - kind: ServiceAccount
 64 |   name: metadata-agent
 65 |   namespace: stackdriver-agents
 66 | ---
 67 | # Service account for Logging Agent; the ClusterRoleBinding below grants it the stackdriver-user:logging-agent ClusterRole.
 68 | apiVersion: v1
 69 | kind: ServiceAccount
 70 | metadata:
 71 |   name: logging-agent
 72 |   namespace: stackdriver-agents
 73 | ---
 74 | # ClusterRole (cluster-scoped, hence no namespace) with the get/list/watch
 75 | # permissions on pods and namespaces required by the Logging Agent
 76 | # filter_kubernetes_metadata plugin.
 77 | apiVersion: rbac.authorization.k8s.io/v1
 78 | kind: ClusterRole
 79 | metadata:
 80 |   name: stackdriver-user:logging-agent
 81 | rules:
 82 | - apiGroups:
 83 |   - ""
 84 |   resources:
 85 |   - pods
 86 |   - namespaces
 87 |   verbs:
 88 |   - watch
 89 |   - get
 90 |   - list
 91 | ---
 92 | # ClusterRoleBinding (cluster-scoped, hence no metadata.namespace) granting
 93 | # the Logging Agent ClusterRole to the logging-agent service account.
 94 | apiVersion: rbac.authorization.k8s.io/v1
 95 | kind: ClusterRoleBinding
 96 | metadata:
 97 |   name: stackdriver-user:logging-agent
 98 | roleRef:
 99 |   apiGroup: rbac.authorization.k8s.io
100 |   kind: ClusterRole
101 |   name: stackdriver-user:logging-agent
102 | subjects:
103 | - kind: ServiceAccount
104 |   name: logging-agent
105 |   namespace: stackdriver-agents
106 | ---
107 | # Service account for Heapster; the ClusterRoleBinding below grants it the stackdriver-user:heapster ClusterRole.
108 | apiVersion: v1
109 | kind: ServiceAccount
110 | metadata:
111 |   name: heapster
112 |   namespace: stackdriver-agents
113 | ---
114 | # ClusterRole (cluster-scoped, hence no namespace) with the get/list/watch
115 | # permissions required by Heapster.
116 | apiVersion: rbac.authorization.k8s.io/v1
117 | kind: ClusterRole
118 | metadata:
119 |   name: stackdriver-user:heapster
120 | rules:
121 | - apiGroups:
122 |   - ""
123 |   resources:
124 |   - events
125 |   - namespaces
126 |   - nodes
127 |   - nodes/stats
128 |   - pods
129 |   verbs:
130 |   - get
131 |   - list
132 |   - watch
133 | - apiGroups:
134 |   - extensions
135 |   resources:
136 |   - deployments
137 |   verbs:
138 |   - get
139 |   - list
140 |   - watch
141 | ---
142 | # ClusterRoleBinding (cluster-scoped, hence no metadata.namespace) granting
143 | # the Heapster ClusterRole to the heapster service account.
144 | apiVersion: rbac.authorization.k8s.io/v1
145 | kind: ClusterRoleBinding
146 | metadata:
147 |   name: stackdriver-user:heapster
148 | roleRef:
149 |   apiGroup: rbac.authorization.k8s.io
150 |   kind: ClusterRole
151 |   name: stackdriver-user:heapster
152 | subjects:
153 | - kind: ServiceAccount
154 |   name: heapster
155 |   namespace: stackdriver-agents
156 | 


--------------------------------------------------------------------------------