├── .gitignore ├── 00_intro-fixed.yaml ├── 00_intro.yaml ├── 01_pp_image-fixed.yaml ├── 01_pp_image.yaml ├── 02_pp_oomer-fixed.yaml ├── 02_pp_oomer.yaml ├── 03_pp_logs.yaml ├── 04_storage-failedmount-fixed.yaml ├── 04_storage-failedmount.yaml ├── 05_network-wrongsel-fixed.yaml ├── 05_network-wrongsel.yaml ├── CNAME ├── LICENSE ├── README.md ├── _config.yml ├── _layouts └── default.html ├── assets └── css │ └── style.scss ├── favicon.ico └── img ├── chaoskube-in-action.png ├── jaeger-overview.png ├── linkerd2-overview.png ├── pod-lifecycle-inline.png └── pod-lifecycle.png /.gitignore: -------------------------------------------------------------------------------- 1 | # Binaries for programs and plugins 2 | *.exe 3 | *.exe~ 4 | *.dll 5 | *.so 6 | *.dylib 7 | 8 | # Test binary, build with `go test -c` 9 | *.test 10 | 11 | # Output of the go coverage tool, specifically when used with LiteIDE 12 | *.out 13 | .DS_Store 14 | -------------------------------------------------------------------------------- /00_intro-fixed.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: unhappy-camper 5 | spec: 6 | selector: 7 | matchLabels: 8 | app: whatever 9 | replicas: 1 10 | template: 11 | metadata: 12 | labels: 13 | app: whatever 14 | spec: 15 | containers: 16 | - name: shell 17 | image: centos:7 18 | command: 19 | - sh 20 | - '-c' 21 | - tail -f /dev/null 22 | -------------------------------------------------------------------------------- /00_intro.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: unhappy-camper 5 | spec: 6 | selector: 7 | matchLabels: 8 | app: whatever 9 | replicas: 1 10 | template: 11 | metadata: 12 | labels: 13 | app: whatever 14 | spec: 15 | containers: 16 | - name: shell 17 | image: centos:7 18 | command: 19 | - sh 20 | - '-c' 21 | - echo "I will just 
print something here and then exit" 22 | -------------------------------------------------------------------------------- /01_pp_image-fixed.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: confused-imager 5 | spec: 6 | selector: 7 | matchLabels: 8 | app: whatever 9 | replicas: 1 10 | template: 11 | metadata: 12 | labels: 13 | app: whatever 14 | spec: 15 | containers: 16 | - name: something 17 | image: nginx:1.19.0 -------------------------------------------------------------------------------- /01_pp_image.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: confused-imager 5 | spec: 6 | selector: 7 | matchLabels: 8 | app: whatever 9 | replicas: 1 10 | template: 11 | metadata: 12 | labels: 13 | app: whatever 14 | spec: 15 | containers: 16 | - name: something 17 | image: simpleservice:0.5.0 -------------------------------------------------------------------------------- /02_pp_oomer-fixed.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: wegotan-oomer 5 | spec: 6 | selector: 7 | matchLabels: 8 | app: oomer 9 | replicas: 1 10 | template: 11 | metadata: 12 | labels: 13 | app: oomer 14 | spec: 15 | containers: 16 | - name: greedymuch 17 | image: centos:7 18 | command: 19 | - sh 20 | - '-c' 21 | - "sleep 5 && yes | tr \\n x | head -c 500m | grep n && sleep 1000" 22 | resources: 23 | limits: 24 | memory: 600M 25 | - name: shell 26 | image: centos:7 27 | command: 28 | - sh 29 | - '-c' 30 | - sleep 1000 -------------------------------------------------------------------------------- /02_pp_oomer.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: wegotan-oomer 
5 | spec: 6 | selector: 7 | matchLabels: 8 | app: oomer 9 | replicas: 1 10 | template: 11 | metadata: 12 | labels: 13 | app: oomer 14 | spec: 15 | containers: 16 | - name: greedymuch 17 | image: centos:7 18 | command: 19 | - sh 20 | - '-c' 21 | - "sleep 5 && yes | tr \\n x | head -c 500m | grep n && sleep 1000" 22 | resources: 23 | limits: 24 | memory: 400M 25 | - name: shell 26 | image: centos:7 27 | command: 28 | - sh 29 | - '-c' 30 | - sleep 1000 -------------------------------------------------------------------------------- /03_pp_logs.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: hiccup 5 | spec: 6 | selector: 7 | matchLabels: 8 | app: hiccup 9 | replicas: 1 10 | template: 11 | metadata: 12 | labels: 13 | app: hiccup 14 | spec: 15 | containers: 16 | - name: theapp 17 | image: centos:7 18 | command: 19 | - sh 20 | - '-c' 21 | - "for x in {1..20}; do echo doing some good work here in $x ; sleep 1; done; echo bye for now; exit" -------------------------------------------------------------------------------- /04_storage-failedmount-fixed.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: wheresmyvolume 5 | spec: 6 | selector: 7 | matchLabels: 8 | app: wheresmyvolume 9 | replicas: 1 10 | template: 11 | metadata: 12 | labels: 13 | app: wheresmyvolume 14 | spec: 15 | containers: 16 | - name: writer 17 | image: centos:7 18 | command: 19 | - sh 20 | - '-c' 21 | - "printf 'some important data to persist' > /tmp/out/data; sleep 10000" 22 | volumeMounts: 23 | - name: xchange 24 | mountPath: /tmp/out 25 | - name: reader 26 | image: centos:7 27 | command: 28 | - sh 29 | - '-c' 30 | - "ls -la /tmp/in/; sleep 10000" 31 | volumeMounts: 32 | - name: xchange 33 | mountPath: /tmp/in 34 | volumes: 35 | - name: xchange 36 | emptyDir: {} 
-------------------------------------------------------------------------------- /04_storage-failedmount.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: wheresmyvolume 5 | spec: 6 | selector: 7 | matchLabels: 8 | app: wheresmyvolume 9 | replicas: 1 10 | template: 11 | metadata: 12 | labels: 13 | app: wheresmyvolume 14 | spec: 15 | containers: 16 | - name: writer 17 | image: centos:7 18 | command: 19 | - sh 20 | - '-c' 21 | - "printf 'some important data to persist' > /tmp/out/data; sleep 10000" 22 | volumeMounts: 23 | - name: xchange 24 | mountPath: /tmp/out 25 | - name: reader 26 | image: centos:7 27 | command: 28 | - sh 29 | - '-c' 30 | - "cat /tmp/in/data; sleep 10000" 31 | volumeMounts: 32 | - name: xchange 33 | mountPath: /tmp/data 34 | volumes: 35 | - name: xchange 36 | emptyDir: {} -------------------------------------------------------------------------------- /05_network-wrongsel-fixed.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: webserver 5 | spec: 6 | ports: 7 | - port: 80 8 | targetPort: 80 9 | selector: 10 | run: webserver -------------------------------------------------------------------------------- /05_network-wrongsel.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: webserver 5 | spec: 6 | ports: 7 | - port: 80 8 | targetPort: 80 9 | selector: 10 | app: webserver -------------------------------------------------------------------------------- /CNAME: -------------------------------------------------------------------------------- 1 | troubleshooting.kubernetes.sh -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache 
License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Troubleshooting Kubernetes Applications 2 | 3 | _Table of contents:_ 4 | 5 | Preparation | Intro | Poking pods 6 | --- | --- | --- 7 | Storage | Network | Security 8 | Observability | Vaccination | References 9 | 10 | To demonstrate the different issues and failures as well as how to fix them, I've been using the commands and resources as shown below. 11 | 12 | _NOTE_: whenever you see a 📄 icon, it means this is a reference to the official Kubernetes [docs](https://kubernetes.io/docs/). 
13 | 14 | ## Prerequisites 15 | 16 | - Kubernetes 1.16 or higher 17 | 18 | ## Preparation 19 | 20 | Before starting, set up: 21 | 22 | ``` 23 | # create the namespace we'll be operating in: 24 | kubectl create ns vnyc 25 | 26 | # in different tmux pane keep an eye on the resources: 27 | watch kubectl -n vnyc get all 28 | ``` 29 | 30 | ## Intro 31 | 32 | Using [00_intro.yaml](00_intro.yaml): 33 | 34 | ``` 35 | kubectl -n vnyc apply -f 00_intro.yaml 36 | 37 | kubectl -n vnyc describe deploy/unhappy-camper 38 | 39 | THEPOD=$(kubectl -n vnyc get po -l=app=whatever --output=jsonpath={.items[*].metadata.name}) 40 | kubectl -n vnyc describe po/$THEPOD 41 | kubectl -n vnyc logs $THEPOD 42 | kubectl -n vnyc exec -it $THEPOD -- sh 43 | 44 | kubectl -n vnyc delete deploy/unhappy-camper 45 | ``` 46 | 47 | ## Poking pods 48 | 49 | ### Pod lifecycle 50 | 51 |  52 | _Download [in original resolution](https://github.com/mhausenblas/troubleshooting-k8s-apps/raw/master/img/pod-lifecycle.png)._ 53 | 54 | 55 | ### Image issue 56 | 57 | Using [01_pp_image.yaml](01_pp_image.yaml): 58 | 59 | ``` 60 | # let's deploy a confused image and look for the error: 61 | kubectl -n vnyc apply -f 01_pp_image.yaml 62 | kubectl -n vnyc get events | grep confused | grep Error 63 | 64 | # fix it by specifying the correct image: 65 | kubectl -n vnyc patch deployment confused-imager \ 66 | --patch '{ "spec" : { "template" : { "spec" : { "containers" : [ { "name" : "something" , "image" : "mhausenblas/simpleservice:0.5.0" } ] } } } }' 67 | 68 | kubectl -n vnyc delete deploy/confused-imager 69 | ``` 70 | 71 | Relevant real-world examples on StackOverflow: 72 | 73 | - [Kubernetes how to debug CrashLoopBackoff](https://stackoverflow.com/questions/44673957/kubernetes-how-to-debug-crashloopbackoff) 74 | - [Kubernetes imagePullSecrets not working; getting “image not found”](https://stackoverflow.com/questions/32510310/kubernetes-imagepullsecrets-not-working-getting-image-not-found) 75 | - [Trying to create a 
Kubernetes deployment but it shows 0 pods available](https://stackoverflow.com/questions/51139988/trying-to-create-a-kubernetes-deployment-but-it-shows-0-pods-available) 76 | 77 | ### Keeps crashing 78 | 79 | Using [02_pp_oomer.yaml](02_pp_oomer.yaml) and [02_pp_oomer-fixed.yaml](02_pp_oomer-fixed.yaml): 80 | 81 | ``` 82 | # prepare a greedy fellow that will OOM: 83 | kubectl -n vnyc apply -f 02_pp_oomer.yaml 84 | 85 | # wait > 5s and then check mem in container: 86 | kubectl -n vnyc exec -it $(kubectl -n vnyc get po -l=app=oomer --output=jsonpath={.items[*].metadata.name}) -c greedymuch -- cat /sys/fs/cgroup/memory/memory.limit_in_bytes /sys/fs/cgroup/memory/memory.usage_in_bytes 87 | 88 | 89 | kubectl -n vnyc describe po $(kubectl -n vnyc get po -l=app=oomer --output=jsonpath={.items[*].metadata.name}) 90 | 91 | # fix the issue: 92 | kubectl -n vnyc apply -f 02_pp_oomer-fixed.yaml 93 | 94 | # wait > 20s 95 | kubectl -n vnyc exec -it $(kubectl -n vnyc get po -l=app=oomer --output=jsonpath={.items[*].metadata.name}) -c greedymuch -- cat /sys/fs/cgroup/memory/memory.limit_in_bytes /sys/fs/cgroup/memory/memory.usage_in_bytes 96 | 97 | kubectl -n vnyc delete deploy wegotan-oomer 98 | ``` 99 | 100 | Relevant real-world examples on StackOverflow: 101 | 102 | - [InfluxDB container dies over time, and can't restart](https://stackoverflow.com/questions/37877432/influxdb-container-dies-over-time-and-cant-restart) 103 | - [AWS deployment with kubernetes 1.7.2 continuously running in pod getting killed and restarted](https://stackoverflow.com/questions/47849502/aws-deployment-with-kubernetes-1-7-2-continuously-running-in-pod-getting-killed) 104 | 105 | ### Something's wrong with the app 106 | 107 | Using [03_pp_logs.yaml](03_pp_logs.yaml): 108 | 109 | ``` 110 | kubectl -n vnyc apply -f 03_pp_logs.yaml 111 | 112 | # nothing to see here: 113 | kubectl -n vnyc describe deploy/hiccup 114 | 115 | # but I see it in the logs: 116 | kubectl -n vnyc logs --follow $(kubectl -n vnyc get 
po -l=app=hiccup --output=jsonpath={.items[*].metadata.name}) 117 | 118 | kubectl -n vnyc delete deploy hiccup 119 | ``` 120 | 121 | Relevant real-world examples on StackOverflow: 122 | 123 | - [My kubernetes pods keep crashing with “CrashLoopBackOff” but I can't find any log](https://stackoverflow.com/questions/41604499/my-kubernetes-pods-keep-crashing-with-crashloopbackoff-but-i-cant-find-any-lo) 124 | - [Kubernetes Readiness probe failed error](https://stackoverflow.com/questions/48540929/kubernetes-readiness-probe-failed-error) 125 | - [Kubernetes error: validating data found invalid field env for v1 PodSpec](https://stackoverflow.com/questions/43532990/kubernetes-error-validating-data-found-invalid-field-env-for-v1-podspec) 126 | 127 | References: 128 | 129 | - [Debugging microservices - Squash vs. Telepresence](https://www.weave.works/blog/debugging-microservices-squash-vs-telepresence) 130 | - [Debugging and Troubleshooting Microservices in Kubernetes with Ray Tsang (Google)](https://www.weave.works/blog/debugging-and-troubleshooting-microservices-in-kubernetes) 131 | - [Troubleshooting Kubernetes Using Logs](https://blog.papertrailapp.com/troubleshoot-kubernetes-using-logs/) 132 | 133 | ## Storage 134 | 135 | Using [04_storage-failedmount.yaml](04_storage-failedmount.yaml) and [04_storage-failedmount-fixed.yaml](04_storage-failedmount-fixed.yaml): 136 | 137 | ``` 138 | kubectl -n vnyc apply -f 04_storage-failedmount.yaml 139 | 140 | # has the data been written? 141 | kubectl -n vnyc exec -it $(kubectl -n vnyc get po -l=app=wheresmyvolume --output=jsonpath={.items[*].metadata.name}) -c writer -- cat /tmp/out/data 142 | 143 | # has the data been read in? 
144 | kubectl -n vnyc exec -it $(kubectl -n vnyc get po -l=app=wheresmyvolume --output=jsonpath={.items[*].metadata.name}) -c reader -- cat /tmp/in/data 145 | 146 | kubectl -n vnyc describe po $(kubectl -n vnyc get po -l=app=wheresmyvolume --output=jsonpath={.items[*].metadata.name}) 147 | 148 | kubectl -n vnyc apply -f 04_storage-failedmount-fixed.yaml 149 | 150 | kubectl -n vnyc delete deploy wheresmyvolume 151 | ``` 152 | 153 | Relevant real-world examples on StackOverflow: 154 | 155 | - [How to find out why mounting an emptyDir volume fails in Kubernetes?](https://stackoverflow.com/questions/51206154/how-to-find-out-why-mounting-an-emptydir-volume-fails-in-kubernetes) 156 | - [Kubernetes NFS volume mount fail with exit status 32](https://stackoverflow.com/questions/34113569/kubernetes-nfs-volume-mount-fail-with-exit-status-32) 157 | 158 | References: 159 | 160 | - [Debugging Kubernetes PVCs](https://itnext.io/debugging-kubernetes-pvcs-a150f5efbe95) 161 | - Further references see [Stateful Kubernetes](https://stateful.kubernetes.sh/) 162 | 163 | ## Network 164 | 165 | Using [05_network-wrongsel.yaml](05_network-wrongsel.yaml) and [05_network-wrongsel-fixed.yaml](05_network-wrongsel-fixed.yaml): 166 | 167 | ``` 168 | kubectl -n vnyc run webserver --image nginx --port 80 169 | 170 | kubectl -n vnyc apply -f 05_network-wrongsel.yaml 171 | 172 | kubectl -n vnyc run -it --rm debugpod --restart=Never --image=centos:7 -- curl webserver.vnyc 173 | 174 | kubectl -n vnyc run -it --rm debugpod --restart=Never --image=centos:7 -- ping webserver.vnyc 175 | 176 | kubectl -n vnyc run -it --rm debugpod --restart=Never --image=centos:7 -- ping $(kubectl -n vnyc get po -l=run=webserver --output=jsonpath={.items[*].status.podIP}) 177 | 178 | kubectl -n vnyc apply -f 05_network-wrongsel-fixed.yaml 179 | 180 | kubectl -n vnyc delete deploy webserver 181 | ``` 182 | 183 | Other scenarios often found: 184 | 185 | - See an error message that says something like `connection refused`? 
You could be hitting the `127.0.0.1` issue with the solution to make the app listen on `0.0.0.0` rather than on localhost. Further, see also some discussion [here](https://superuser.com/questions/949428/whats-the-difference-between-127-0-0-1-and-0-0-0-0). 186 | - Missing firewall rules, from cluster-internal open ports to communication between clusters can cause all kinds of issues. It very much depends on the environment (AWS, Azure, GCP, on-premises, etc.) how exactly you go about it and most certainly is an infra admin task rather than an appops task. 187 | - Taking a pod offline for debugging: on the pod, simply remove the relevant label(s) the service uses in its `selector` and that removes the pod from the pool of endpoints the service has to serve traffic to while leaving the pod running, ready for you to `kubectl exec -it` in. 188 | 189 | Relevant real-world examples on StackOverflow: 190 | 191 | - [Connection Refused error when connecting to Kubernetes Redis Service](https://stackoverflow.com/questions/48597726/connection-refused-error-when-connecting-to-kubernetes-redis-service/) 192 | - [“kubectl get pods” showing STATUS - ImagePullbackOff](https://stackoverflow.com/questions/51164795/kubectl-get-pods-showing-status-imagepullbackoff) 193 | - [Service not exposing in kubernetes](https://stackoverflow.com/questions/51662015/service-not-exposing-in-kubernetes) 194 | - [Kubernetes: Can not curl minikube pod](https://stackoverflow.com/questions/52289583/kubernetes-can-not-curl-minikube-pod/52289956) 195 | 196 | References: 197 | 198 | - [Debug Services](https://kubernetes.io/docs/tasks/debug-application-cluster/debug-service/) 📄 199 | - [Troubleshooting Kubernetes Networking Issues](https://gravitational.com/blog/troubleshooting-kubernetes-networking/) 200 | - Further references see [Container Networking](https://mhausenblas.info/cn-ref/) 201 | 202 | ## Security 203 | 204 | ``` 205 | kubectl -n vnyc create sa prober 206 | kubectl -n vnyc run -it --rm probepod 
--serviceaccount=prober --restart=Never --image=centos:7 -- sh 207 | 208 | # in the container; will result in a 403, b/c we don't have the permissions necessary: 209 | export CURL_CA_BUNDLE=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt 210 | APISERVERTOKEN=$(cat /var/run/secrets/kubernetes.io/serviceaccount/token) 211 | curl -H "Authorization: Bearer $APISERVERTOKEN" https://kubernetes.default/api/v1/namespaces/vnyc/pods 212 | 213 | # in a different tmux pane, verify whether the SA is actually allowed to list pods: 214 | kubectl -n vnyc auth can-i list pods --as=system:serviceaccount:vnyc:prober 215 | 216 | # … seems not to be the case, so give sufficient permissions: 217 | kubectl create clusterrole podreader \ 218 | --verb=get --verb=list \ 219 | --resource=pods 220 | 221 | kubectl -n vnyc create rolebinding allowpodprobes \ 222 | --clusterrole=podreader \ 223 | --serviceaccount=vnyc:prober \ 224 | --namespace=vnyc 225 | 226 | # clean up 227 | kubectl delete clusterrole podreader && kubectl delete ns vnyc 228 | ``` 229 | 230 | Relevant real-world examples on StackOverflow: 231 | 232 | - [Accessing Kubernetes API from pod fails although roles are configured is configured](https://stackoverflow.com/questions/52095161/accessing-kubernetes-api-from-pod-fails-although-roles-are-configured-is-configu) 233 | - [How to deploy a deployment in another namespace in Kubernetes?](https://stackoverflow.com/questions/52297676/how-to-deploy-a-deployment-in-another-namespace-in-kubernetes/52298101#52298101) 234 | - [Unable to read my newly created Kubernetes secret](https://stackoverflow.com/questions/51418711/unable-to-read-my-newly-created-kubernetes-secret/51419033#51419033) 235 | 236 | References see [kubernetes-security.info](https://kubernetes-security.info/). 
237 | 238 | ## Observability 239 | 240 | From metrics ([Prometheus](https://prometheus.io/) and [Grafana](https://grafana.com/)) to logs (EFK/ELK stack) to tracing ([OpenCensus](https://opencensus.io/) and [OpenTracing](http://opentracing.io/)). 241 | 242 | ### Service ops in practice 243 | 244 | Show [Linkerd 2.0 in action](https://medium.com/@mhausenblas/linkerd-2-0-service-ops-for-you-and-me-281cc5bd6424) using [this Katacoda scenario](https://www.katacoda.com/mhausenblas/scenarios/linkerd2) as a starting point. 245 | 246 |  247 | 248 | ### Distributed tracing and debugging 249 | 250 | Show [Jaeger 1.6 in action](https://www.jaegertracing.io/docs/1.6/getting-started/) using [this Katacoda scenario](https://katacoda.com/opentracing/scenarios/golang-hotrod-demo). 251 | 252 |  253 | 254 | References: 255 | 256 | - [Logs and Metrics](https://medium.com/@copyconstruct/logs-and-metrics-6d34d3026e38) 257 | - [Evolution of Monitoring and Prometheus](https://www.slideshare.net/brianbrazil/evolution-of-monitoring-and-prometheus-dublin-2018) 258 | - [The life of a span](https://medium.com/jaegertracing/the-life-of-a-span-ee508410200b) 259 | - [Distributed Tracing with Jaeger & Prometheus on Kubernetes](https://blog.openshift.com/openshift-commons-briefing-82-distributed-tracing-with-jaeger-prometheus-on-kubernetes/) 260 | - Debugging: [KubeSquash](https://github.com/solo-io/kubesquash) 261 | 262 | ## Vaccination 263 | 264 | Show [chaoskube](https://github.com/linki/chaoskube) in action, killing off random pods in the `vnyc` namespace. 
265 | 266 | We have the following setup: 267 | 268 | ``` 269 | +----------------+ 270 | | | 271 | +-----> | webserver/pod1 | 272 | | | | 273 | +----------------+ | +----------------+ 274 | | | | +----------------+ 275 | | appserver/pod1 +--------+ +---------+ | | | 276 | | | | +--+ | +-----> | webserver/pod2 | 277 | +----------------+ | X | | | | 278 | | X | | +----------------+ 279 | | X | | +----------------+ 280 | v X | | | | 281 | X svc/webserver +--------> | webserver/pod3 | 282 | ^ X | | | | 283 | +----------------+ | X | | +----------------+ 284 | | | | X | | +----------------+ 285 | | appserver/pod2 +--------+ X | | | | 286 | | | +--+ | +-----> | webserver/pod4 | 287 | +----------------+ +----------+ | | | 288 | | +----------------+ 289 | | +----------------+ 290 | | | | 291 | +-----> | webserver/pod5 | 292 | | | 293 | +----------------+ 294 | ``` 295 | 296 | That is, a `webserver` running with five replicas along with a service as well as an `appserver` running with two replicas that queries said service. 
297 | 298 | ``` 299 | # let's create our victims, that is webservers and appservers: 300 | kubectl create ns vnyc 301 | kubectl -n vnyc run webserver --image nginx --port 80 --replicas 5 302 | kubectl -n vnyc expose deploy/webserver 303 | kubectl -n vnyc run appserver --image centos:7 --replicas 2 -- sh -c "while true; do curl webserver ; sleep 10 ; done" 304 | kubectl -n vnyc logs deploy/appserver --follow 305 | 306 | # also keep an eye on the events generated: 307 | kubectl -n vnyc get events --watch 308 | 309 | # now release the chaos monkey: 310 | chaoskube \ 311 | --interval 30s \ 312 | --namespaces 'vnyc' \ 313 | --no-dry-run 314 | 315 | kubectl delete ns vnyc 316 | ``` 317 | 318 | And here's a screenshot of `chaoskube` in action, with all the above commands applied: 319 | 320 |  321 | 322 | References: 323 | 324 | - [Kubernetes: five steps to well-behaved apps](https://medium.com/@betz.mark/kubernetes-five-steps-to-well-behaved-apps-a7cbeb99471a) 325 | - [Kubernetes Best Practices](https://medium.com/google-cloud/kubernetes-best-practices-8d5cd03446e2) 326 | - [Developing on Kubernetes](https://kubernetes.io/blog/2018/05/01/developing-on-kubernetes/) 327 | - [Kubernetes Application Operator Basics](https://blog.openshift.com/kubernetes-application-operator-basics/) 328 | - [Using chaoskube with OpenEBS](https://blog.openebs.io/chaos-engineering-on-openebs-7d4e0f995545) 329 | - Tooling: 330 | - [linki/chaoskube](https://github.com/linki/chaoskube) 331 | - [asobti/kube-monkey](https://github.com/asobti/kube-monkey) 332 | - [bloomberg/powerfulseal](https://github.com/bloomberg/powerfulseal) 333 | - [AlexsJones/k8aos](https://github.com/AlexsJones/k8aos) 334 | - [jnewland/kubernetes-pod-chaos-monkey](https://github.com/jnewland/kubernetes-pod-chaos-monkey) 335 | 336 | ## References 337 | 338 | ### General 339 | 340 | - [Troubleshoot Applications](https://kubernetes.io/docs/tasks/debug-application-cluster/debug-application/) 📄 341 | - [Troubleshoot 
Clusters](https://kubernetes.io/docs/tasks/debug-application-cluster/debug-cluster/) 📄 342 | - A site dedicated to [Kubernetes Troubleshooting](https://kubernetes.feisky.xyz/en/troubleshooting/) 343 | - [Debug a Go Application in Kubernetes from IDE](https://itnext.io/debug-a-go-application-in-kubernetes-from-ide-c45ad26d8785) 344 | - CrashLoopBackoff, Pending, FailedMount and Friends: Debugging Common Kubernetes Cluster (KubeCon NA 2017): [video](https://www.youtube.com/watch?v=7FOCG5kua1w) and [slide deck](https://afontofuseless.info/debugging-kubernetes-app-deploys-kc2017/) 345 | - 10 Most Common Reasons Kubernetes Deployments Fail: [Part 1](https://kukulinski.com/10-most-common-reasons-kubernetes-deployments-fail-part-1/) and [Part 2](https://kukulinski.com/10-most-common-reasons-kubernetes-deployments-fail-part-2/) 346 | 347 | ### Language or platform specific 348 | 349 | - [Debugging Microservices: How Google SREs Resolve Outages](https://www.infoq.com/presentations/google-debug-microservices) 350 | - [Debugging Microservices: Lessons from Google, Facebook, Lyft](https://thenewstack.io/debugging-microservices-lessons-from-google-facebook-lyft/) 351 | - [Troubleshooting Java applications on OpenShift](https://developers.redhat.com/blog/2017/08/16/troubleshooting-java-applications-on-openshift/) 352 | - Google Kubernetes Engine [Troubleshooting](https://cloud.google.com/kubernetes-engine/docs/troubleshooting) docs 353 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-dinky 2 | title: TKA 3 | description: Hands-on troubleshooting Kubernetes applications. 4 | show_downloads: false 5 | google_analytics: 6 | -------------------------------------------------------------------------------- /_layouts/default.html: -------------------------------------------------------------------------------- 1 | 2 | 3 |
4 | 5 | 6 | 7 | 8 | {% seo %} 9 | 10 | 13 |{{ site.description | default: site.github.project_tagline }}
25 |