├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── docs ├── container.md ├── golang.md ├── kubernetes.md ├── openshift.md ├── os.md ├── performance.md └── special.md ├── images ├── fdisk-extend-partition.png └── xfs_growfs.png └── tool ├── gh-md-toc └── update-toc.sh /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | toc: 2 | @sh ./tool/update-toc.sh 3 | 4 | .PHONY: toc 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # TOC 2 | * [Linux and OS](./docs/os.md) 3 | * [Container](./docs/container.md) 4 | * [Kubernetes](./docs/kubernetes.md) 5 | * [OpenShift](./docs/openshift.md) 6 | * [Golang](./docs/golang.md) 7 | * [Special Column](./docs/special.md) 8 | * [Performance](./docs/performance.md) 9 | -------------------------------------------------------------------------------- /docs/container.md: -------------------------------------------------------------------------------- 1 | # TOC 2 | 3 | 4 | * [TOC](#toc) 5 | * [cgroup](#cgroup) 6 | * [cgroup子系统](#cgroup子系统) 7 | * [cpu和cpuacct cgroup](#cpu和cpuacct-cgroup) 8 | * [根据pod的cpu request和limit如何设置cpu cgroup参数](#根据pod的cpu-request和limit如何设置cpu-cgroup参数) 9 | * [cfs_period_us和cfs_quota_us进一步解释](#cfsperiodus和cfsquotaus进一步解释) 10 | * [cpu.rt_runtime_us](#cpurtruntimeus) 11 | * [分析命令](#分析命令) 12 | * [cpuset](#cpuset) 13 | * [memory](#memory) 14 | * [devices](#devices) 15 | * [pids](#pids) 16 | * [挂载cgroupfs](#挂载cgroupfs) 17 | * [判断是否为cgroupv2](#判断是否为cgroupv2) 18 | * [降级为cgroupv1](#降级为cgroupv1) 19 | * [常用操作](#常用操作) 20 | * [使用systemd管理cgroup](#使用systemd管理cgroup) 21 | * [查看统计信息](#查看统计信息) 22 | * [namespaces](#namespaces) 23 | * [pid](#pid) 24 | * [找到host和容器内进程pid关系](#找到host和容器内进程pid关系) 25 | * [找到一个pidns下的进程](#找到一个pidns下的进程) 26 | * [mount](#mount) 27 | * [主机上查看和修改容器内文件](#主机上查看和修改容器内文件) 28 | * [net](#net) 29 | * [常用命令](#常用命令) 30 | * [常用工具](#常用工具) 31 | * [lsns](#lsns) 32 | * [nsenter](#nsenter) 33 | * [unshare](#unshare) 34 | * [OCI标准](#oci标准) 35 | * [运行时标准 runtime-spec](#运行时标准-runtime-spec) 36 | * [镜像标准 image-spec](#镜像标准-image-spec) 37 | * [manifest](#manifest) 38 | * [OCI Image Media Types](#oci-image-media-types) 39 | * [layer](#layer) 40 | * [blobs](#blobs) 41 | * [分发标准 distribution-spec](#分发标准-distribution-spec) 42 | * [容器镜像](#容器镜像) 43 | * [从无到有制作基础镜像](#从无到有制作基础镜像) 44 | * [采用合并打包实现缩容](#采用合并打包实现缩容) 45 | * [移除基础镜像层实现缩容](#移除基础镜像层实现缩容) 46 | * [使用buildx构建多架构容器镜像](#使用buildx构建多架构容器镜像) 47 | * [容器存储](#容器存储) 48 | * [overlay2](#overlay2) 49 | * [存储配额限制](#存储配额限制) 50 | * [容器可读可写层用量统计](#容器可读可写层用量统计) 51 | * [根据overlay数据目录digest反查容器/镜像](#根据overlay数据目录digest反查容器镜像) 52 | * [宿主机上直接修改容器内文件](#宿主机上直接修改容器内文件) 53 | * [容器安全](#容器安全) 54 | * [Discretionary Access Control](#discretionary-access-control) 55 | * [linux capabilities](#linux-capabilities) 56 | * [seccomp](#seccomp) 57 | * [原理](#原理) 58 | * [确认操作系统和容器运行时支持seccomp](#确认操作系统和容器运行时支持seccomp) 59 | * [为pod设置seccomp自定义策略](#为pod设置seccomp自定义策略) 60 | * [排错](#排错) 61 | * [runc加载seccomp策略时报“OCI permission denied”](#runc加载seccomp策略时报oci-permission-denied) 62 | * [AppArmor](#apparmor) 63 | * [使用AppArmor的前置条件](#使用apparmor的前置条件) 64 | * [selinux](#selinux) 65 | * [深入学习](#深入学习) 66 | * [一次完整的报错分析](#一次完整的报错分析) 67 | * [常用操作](#常用操作-1) 68 | * [为Pod/容器设置selinux label](#为pod容器设置selinux-label) 69 | * [根据审计日志设置selinux规则](#根据审计日志设置selinux规则) 70 | * [容器运行时](#容器运行时) 71 | * [runc](#runc) 72 | * [常用命令](#常用命令-1) 73 | * [cri-o如何通过conmon调用runc创建容器](#cri-o如何通过conmon调用runc创建容器) 74 | * [crun](#crun) 75 | * [OCI](#oci) 76 | * [oci-hooks](#oci-hooks) 77 | * [Containerd](#containerd) 78 | * [常用操作](#常用操作-2) 79 | * [如何编译containerd](#如何编译containerd) 80 | * 
[CRI-O](#cri-o) 81 | * [统计容器可读可写层存储用量](#统计容器可读可写层存储用量) 82 | * [指定seccomp profile](#指定seccomp-profile) 83 | * [容器存储目录](#容器存储目录) 84 | * [non-root用户使用devices](#non-root用户使用devices) 85 | * [检查容器存储数据量是否合理](#检查容器存储数据量是否合理) 86 | * [配置修改](#配置修改) 87 | * [修改容器内ulimit配置](#修改容器内ulimit配置) 88 | * [问题debug](#问题debug) 89 | * [看crio日志](#看crio日志) 90 | * [创建容器失败](#创建容器失败) 91 | * [Deep Dive](#deep-dive) 92 | * [创建容器](#创建容器) 93 | * [列出镜像ImageService/ListImages](#列出镜像imageservicelistimages) 94 | * [podman](#podman) 95 | * [配置管理](#配置管理) 96 | * [使用podman查看cri创建的pod](#使用podman查看cri创建的pod) 97 | * [容器镜像和overlay/layer对应关系](#容器镜像和overlaylayer对应关系) 98 | * [在login后podman的认证信息可能存放的几个地方](#在login后podman的认证信息可能存放的几个地方) 99 | * [创建manifest list支持多架构镜像](#创建manifest-list支持多架构镜像) 100 | * [使用podman统计容器镜像大小](#使用podman统计容器镜像大小) 101 | * [常用命令](#常用命令-2) 102 | * [crictl](#crictl) 103 | * [直接创建容器](#直接创建容器) 104 | * [创建Pod Sandbox](#创建pod-sandbox) 105 | * [创建业务容器](#创建业务容器) 106 | * [如何配置](#如何配置) 107 | * [查看容器资源用量](#查看容器资源用量) 108 | * [容器可读可写层存储占用top10](#容器可读可写层存储占用top10) 109 | * [容器可读可写层inode占用top10](#容器可读可写层inode占用top10) 110 | * [根据进程pid查询pod](#根据进程pid查询pod) 111 | * [根据pod的uid查询pod](#根据pod的uid查询pod) 112 | * [Docker](#docker) 113 | * [容器环境下的swap使用](#容器环境下的swap使用) 114 | * [深入docker stats命令](#深入docker-stats命令) 115 | * [Docker问题定位](#docker问题定位) 116 | * [Docker卡死hang住](#docker卡死hang住) 117 | * [Docker操作](#docker操作) 118 | * [常用操作](#常用操作-3) 119 | * [提取镜像rootfs文件](#提取镜像rootfs文件) 120 | * [docker build构建镜像](#docker-build构建镜像) 121 | * [安装指定版本docker](#安装指定版本docker) 122 | * [关闭docker0](#关闭docker0) 123 | * [修改容器的ulimit默认配置](#修改容器的ulimit默认配置) 124 | * [使用docker-storage-setup初始化docker存储](#使用docker-storage-setup初始化docker存储) 125 | * [构建Docker镜像最佳实践(Alpine)](#构建docker镜像最佳实践alpine) 126 | * [强制删除容器](#强制删除容器) 127 | * [找到容器使用的dm-xx设备](#找到容器使用的dm-xx设备) 128 | * [docker pull加速](#docker-pull加速) 129 | * [docker使用代理](#docker使用代理) 130 | * [容器文件系统使用率统计](#容器文件系统使用率统计) 131 | * [强制重启Docker服务](#强制重启docker服务) 132 | * [镜像仓库和工具](#镜像仓库和工具) 133 | * [skopeo](#skopeo) 134 | * [镜像搬运工](#镜像搬运工) 135 | * [Windows环境上源码运行skopeo搬运镜像](#windows环境上源码运行skopeo搬运镜像) 136 | 137 | 138 | # cgroup 139 | 140 | cgroup的原生接口通过cgroupfs提供,类似于procfs和sysfs,是一种虚拟文件系统,用户可以通过文件操作实现cgroup的组织管理。 141 | 142 | cgroup可以限制、记录、隔离进程组所使用的物理资源。 143 | 144 | 子进程创建之初,与其父进程处于同一个cgroup的控制组里。 145 | 146 | cgroup实现本质上是给系统进程挂上hooks,当task运行过程中涉及到某类资源的使用时就会触发hook上附带的子系统进行检测。 147 | 148 | 主要作用包括: 149 | 150 | - 资源限制:可以对进程组使用的资源总额进行限制(例如内存上限,一旦超过配额就触发OOM异常) 151 | - 优先级分配:通过分配的CPU时间片数量及硬盘IO带宽大小,相当于控制进程运行的优先级 152 | - 资源统计:统计系统的资源使用量,如CPU使用时长、内存用量等,非常适用于计费和监控 153 | - 进程控制:对进程组执行挂起、恢复等操作 154 | 155 | ## cgroup子系统 156 | 157 | | 类型 | 说明 | 158 | |------------|-------------------------------------------------------------------------------| 159 | | cpuset | 为cgroup中的task分配独立的cpu(针对多处理器系统)和内存 | 160 | | cpu | 控制task对cpu的使用 | 161 | | cpuacct | 自动生成cgroup中task对cpu资源使用情况的报告 | 162 | | memory | 设定cgroup中task对内存使用量的限定,并且自动生成这些task对内存资源使用情况的报告 | 163 | | blkio | 为块设备设定输入/输出限制 | 164 | | devices | 开启或关闭cgroup中task对设备的访问 | 165 | | freezer | 挂起或恢复cgroup中的task | 166 | | net_cls | docker没有直接使用,其通过使用等级识别符(classid)标记网络数据包,从而允许Linux流量控制(TC)程序识别从具体cgroup中生成的数据包 | 167 | | perf_event | 对cgroup中的task进行统一的性能测试 | 168 | | hugetlb | TODO | 169 | 170 | ### cpu和cpuacct cgroup 171 | | 配置 | 说明 | 172 | |----------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| 173 | | cpu.cfs_burst_us | 
CFS调度器,允许在一个period内,cpu资源用量超过quota限制,预支的部分在后面的period里扣减出去。 | 174 | | cpu.cfs_period_us | cfs周期,单位微秒,默认值100000 | 175 | | cpu.cfs_quota_us | 用以配置在当前cfs周期下能够获取的调度配额,单位微秒,如果给95%个核则配置95000,如果给5个核则配置500000,默认值-1表示不受限 | 176 | | cpu.shares | 各cgroup共享cpu的权重值,默认1024,闲时cpu用量能超过根据权重计算的共享比例,忙时根据共享比例分配cpu资源 | 177 | | cpu.stat | **nr_periods**, 表示过去了多少个cpu.cfs_period_us里面配置的时间周期
**nr_throttled**, 在上面的这些周期中,有多少次是受到了限制(即cgroup中的进程在指定的时间周期中用光了它的配额)
**throttled_time**, cgroup中的进程被限制使用CPU持续了多长时间(纳秒) | 178 | | cpu.idle | | 179 | | cpuacct.usage | 所有cpu核的累加使用时间(nanoseconds) | 180 | | cpuacct.usage_percpu | 针对多核,输出的是每个CPU的使用时间(nanoseconds) | 181 | | cpuacct.stat | 输出系统(system/kernel mode)耗时和用户(user mode)耗时,单位为USER_HZ。 | 182 | 183 | `cpu.shares`用于设置下限,在cpu繁忙时生效。`cpu.cfs_period_us`和`cpu.cfs_quota_us`设置硬上限。 184 | 185 | 参见: 186 | - [限制cgroup的CPU使用(subsystem之cpu)](https://segmentfault.com/a/1190000008323952) 187 | - [CFS Bandwidth Control](https://www.kernel.org/doc/Documentation/scheduler/sched-bwc.txt) 188 | - [Linux cgroup资源隔离各个击破之 - cpu隔离1](https://developer.aliyun.com/article/54483) 189 | - [CFS Scheduler](https://www.kernel.org/doc/Documentation/scheduler/sched-design-CFS.txt) 190 | - [What is the relationship between cpu.shares and cpu.cfs_quota_us in context of cgroup?](https://stackoverflow.com/questions/55901070/what-is-the-relationship-between-cpu-shares-and-cpu-cfs-quota-us-in-context-of-c) 191 | 192 | #### 根据pod的cpu request和limit如何设置cpu cgroup参数 193 | 建一个测试Pod,其`resources`配置如下: 194 | ```bash 195 | resources: 196 | requests: 197 | cpu: 0 198 | limits: 199 | cpu: 1 200 | ``` 201 | 202 | 创建Pod后可确认: 203 | * **调度效果**:对request没有要求,不会占节点的allocated request数。 204 | * **QoS类型**:Burstable 205 | 206 | 进一步查看cpu cgroup参数: 207 | ```bash 208 | # cat cpu.cfs_period_us 209 | 100000 210 | # cat cpu.cfs_quota_us 211 | 100000 212 | # cat cpu.shares 213 | 2 214 | ``` 215 | * 可看到 _cpu.cfs_quota_us_ / _cpu.cfs_period_us_ 为1,这个是上限。 216 | * *cpu.shares*为2,而一个核的权重为1024,因此2/1024近乎为0,可看到下限配置很低,对应`request 0`。 217 | 218 | 219 | 作为对比,更新测试Pod的`resources`配置如下: 220 | ```bash 221 | resources: 222 | requests: 223 | cpu: 0.5 224 | limits: 225 | cpu: 1.5 226 | ``` 227 | 这时cpu cgroup参数如下: 228 | ```bash 229 | # cat cpu.cfs_period_us 230 | 100000 231 | # cat cpu.cfs_quota_us 232 | 150000 233 | # cat cpu.shares 234 | 512 235 | ``` 236 | * 可看到 _cpu.cfs_quota_us_ / _cpu.cfs_period_us_ 为1.5,这个是上限。 237 | * *cpu.shares* / 1024 为0.5,对应`request 0.5`。 238 | 239 | #### cfs_period_us和cfs_quota_us进一步解释 240 | period为100000、quota为50000和period为10000、quota为5000,容器的cpu limit均为0.5核,有什么区别? 
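先用一个量化的小例子说明(任务耗时数值为假设,仅作示意):假设容器内是单线程任务,一次需要连续运行约30ms的CPU时间。
* period=100000(100ms)、quota=50000(50ms):30ms小于单个period内50ms的配额,这次突发可以在一个period内一次性跑完,不会被throttle;
* period=10000(10ms)、quota=5000(5ms):每个period最多跑5ms,30ms的工作至少要跨6个period,期间被反复throttle,完成时刻被拉长到约55ms;但每隔10ms都会重新获得CPU,对同节点其它任务更友好、响应也更及时。

一般性的结论: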
241 | * 每个period内,最多执行quota时间。如果在quota时间内未执行完,将被throttle(统计到stat里),并只能等待下一个period继续执行。 242 | * period越大,整体吞吐能力越好、削峰效果越好,但会导致实时性变差,反之亦然。 243 | 244 | 辅以cfs_burst_us,能既获取良好的吞吐能力,又兼顾实时性,具体的: 245 | * CFS调度器,允许在一个period内,cpu资源用量超过quota限制,预支的部分在后面的period里扣减出去。 246 | 247 | #### cpu.rt_runtime_us 248 | 内核需要配置上`CONFIG_RT_GROUP_SCHED`。 249 | 250 | #### 分析命令 251 | ```bash 252 | # CPU周期次数 253 | old=0 254 | new=0 255 | while true; do 256 | new=$(cat cpu.stat | grep nr_periods | awk '{print $2}') 257 | delta=$((new-old)) 258 | echo "$(date) $delta" 259 | old=$new 260 | sleep 1s 261 | done 262 | 263 | # CPU限速次数 264 | old=0 265 | new=0 266 | while true; do 267 | new=$(cat cpu.stat | grep nr_throttled | awk '{print $2}') 268 | delta=$((new-old)) 269 | echo "$(date) $delta" 270 | old=$new 271 | sleep 1s 272 | done 273 | 274 | # 单个容器徒手实现 crictl stats --seconds 10 得效果 275 | cid=xxx 276 | while true; do 277 | start=$(crictl stats -o json $cid | jq -r '.stats[0].cpu | .timestamp + " " + .usageCoreNanoSeconds.value') 278 | sleep 10s 279 | finished=$(crictl stats -o json $cid | jq -r '.stats[0].cpu | .timestamp + " " + .usageCoreNanoSeconds.value') 280 | 281 | ts_start=$(echo $start | cut -d' ' -f1) 282 | usage_start=$(echo $start | cut -d' ' -f2) 283 | ts_finished=$(echo $finished | cut -d' ' -f1) 284 | usage_finished=$(echo $finished | cut -d' ' -f2) 285 | 286 | ts_delta=$((ts_finished - ts_start)) 287 | usage_delta=$((usage_finished - usage_start)) 288 | usage=$(echo $(awk -v usage_delta="$usage_delta" -v ts_delta="$ts_delta" 'BEGIN {print usage_delta * 100 / ts_delta}') | cut -d. -f1) 289 | 290 | echo "$(date) $usage%" 291 | done 292 | ``` 293 | 294 | ### cpuset 295 | 遍历所有kubernetes pod的cpu亲和性: 296 | ```bash 297 | for f in $(find /sys/fs/cgroup/cpuset -name "cpuset.cpus"); do printf "%-220s %s\n" $f $(cat $f); done 298 | ``` 299 | 300 | ### memory 301 | TODO: cgroup v1的oom,文件缓存*file_dirty* 和 *file_writeback* 的内存用量,这部分也记到容器内存,可能导致oom。 302 | 参见链接[cgroup-v2](https://docs.kernel.org/admin-guide/cgroup-v2.html) 。 303 | 304 | 其它相关说明: 305 | * 系统参数`vm.dirty*`,参见[更加积极的脏页缓存刷新](./os.md#更加积极的脏页缓存刷新) 。针对大内存节点,调优 vm.dirty 参数,更加积极的脏数据刷新,避免脏页积累导致的容器内 file_dirty 和 file_writeback 过大、容器OOM。 306 | * 读写文件时*Direct I/O*参数,即`O_DIRECT`标识,避免文件系统缓存,不过相应的带来IO性能降低。 307 | * cgroupv2会限制内存group中pagecache内存用量,因此能避免上述oom。 308 | * cgroup oom时,failcnt为什么会出现很大的情况?也就是说,分配内存失败,要失败很多次才触发oom-kill,那这里的策略是什么 309 | > failcnt数值反映的是触发limit的次数,memcg每次申请内存的时候都会先判断本次申请是否会超limit,如果会超,则failcnt++(此时并没有真正去申请), 310 | > 后续流程中会尝试try_to_free_mem_cgroup_pages,尝试回收,但是力度很小(主要是inode/dentry cache,vm.vfs_cache_pressure这个参数影响回收力度), 311 | > 最终如果不够用,就触发oom,够用则只是增加了failcnt统计。 312 | 313 | ### devices 314 | ```bash 315 | echo "b 7:0 rwm" > /sys/fs/cgroup/devices/kubepods.slice/devices.allow 316 | ``` 317 | 318 | ### pids 319 | 可用于控制容器的进程数: 320 | ``` 321 | pids.current pids.events pids.max 322 | ``` 323 | 324 | 检查pids数TOP20: 325 | ```bash 326 | for p in $(find /sys/fs/cgroup/pids/ -name "pids.current"); do echo "$(cat $p) $p"; done | sort -rn | head -n20 327 | ``` 328 | 329 | ## 挂载cgroupfs 330 | 331 | 以cpuset子系统为例: 332 | 333 | ```bash 334 | mount -t cgroup -o cpuset cpuset /sys/fs/cgroup/cpuset 335 | ``` 336 | 337 | ## 判断是否为cgroupv2 338 | ```bash 339 | mkdir /tmp/hehe 340 | # 看能否挂载成功 341 | mount -t cgroup2 none /tmp/hehe 342 | 343 | # 另一种方法,看能否搜索到 cgroup2 344 | grep cgroup /proc/filesystems 345 | ``` 346 | 347 | ## 降级为cgroupv1 348 | 主机上修改grub配置,重启主机生效: 349 | ```bash 350 | sudo grubby --update-kernel=ALL --args="systemd.unified_cgroup_hierarchy=0" 351 | ``` 352 | 353 | ## 常用操作 
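除了下文借助systemd的方式,也可以直接操作cgroupfs完成建组、限额、进程归组等常用操作。下面是基于cgroup v1 cpu子系统的一个操作示意(组名demo、限额数值和pid均为举例):
```bash
# 在cpu子系统下新建一个名为demo的cgroup
mkdir /sys/fs/cgroup/cpu/demo

# 限制该组最多使用0.5核:每100ms周期内最多运行50ms
echo 100000 > /sys/fs/cgroup/cpu/demo/cpu.cfs_period_us
echo 50000 > /sys/fs/cgroup/cpu/demo/cpu.cfs_quota_us

# 把一个正在运行的进程(pid仅为示例)加入该cgroup
echo 12345 > /sys/fs/cgroup/cpu/demo/cgroup.procs

# 观察限流统计:nr_periods、nr_throttled、throttled_time
cat /sys/fs/cgroup/cpu/demo/cpu.stat

# 清理:先把进程挪回上级cgroup,再删除空目录
echo 12345 > /sys/fs/cgroup/cpu/cgroup.procs
rmdir /sys/fs/cgroup/cpu/demo
```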
354 | 355 | ### 使用systemd管理cgroup 356 | ```bash 357 | systemd-cgls # 查看systemd cgroup的配置层级关系 358 | systemd-cgtop # 基于cgroup,直接查看cpu和内存的使用情况 359 | ``` 360 | 361 | ```bash 362 | mount -t cgroup 363 | lssubsys -m 364 | ls -l /sys/fs/cgroup/ 365 | lscgroup 366 | man cgconfig.conf 367 | cgcreate 368 | cgdelete 369 | ``` 370 | 371 | 372 | ### 查看统计信息 373 | ```bash 374 | # 全量统计信息 375 | cat /proc/cgroups 376 | 377 | ``` 378 | 379 | 380 | # namespaces 381 | 382 | 进一步阅读: 383 | * [The 7 most used Linux namespaces](https://www.redhat.com/sysadmin/7-linux-namespaces) 384 | * [Building a Linux container by hand using namespaces](https://www.redhat.com/sysadmin/building-container-namespaces) 385 | 386 | ## pid 387 | 388 | ### 找到host和容器内进程pid关系 389 | host上,查看进程的status文件,可看到其在容器内的pid: 390 | ```bash 391 | # cat /proc//status | grep NSpid 392 | NSpid: 12345 2 393 | ``` 394 | 其中第1列是进程在host上的pid,第2列是容器内的pid。 395 | 396 | 进一步阅读: 397 | * [Building containers by hand: The PID namespace](https://www.redhat.com/sysadmin/pid-namespace) 398 | 399 | ### 找到一个pidns下的进程 400 | ```bash 401 | # 找到一个process的pid命名空间(inode),适用于容器内或者host上执行 402 | ls -Li /proc//ns/pid 403 | # 也可以列出全部的pid命名空间 404 | lsns -t pid 405 | 406 | # host上遍历寻找所有该pid命名空间下的进程,其中xxxxxxxxxx是pidns的inode 407 | ps -eo pidns,pid,lwp,cmd | awk '$1==xxxxxxxxxx' 408 | ``` 409 | 410 | ## mount 411 | 进一步阅读: 412 | * [Building a container by hand using namespaces: The mount namespace](https://www.redhat.com/sysadmin/mount-namespaces) 413 | 414 | ### 主机上查看和修改容器内文件 415 | ```bash 416 | nsenter -t $(pidof xxx) -m ls 417 | nsenter -t $(pidof xxx) -m vi /path/to/file 418 | ``` 419 | 420 | ## net 421 | ``` 422 | # 使用 ip netns 命令遍历所有网络命名空间,并执行配置 423 | ip netns ls | awk '{print $1}' | xargs -I {} ip netns exec {} sysctl -w net.ipv4.vs.foo=bar 424 | ``` 425 | 426 | ## 常用命令 427 | ```bash 428 | # 查看ns的inode信息 429 | ls -Li /proc/1/ns/net 430 | # TODO: https://unix.stackexchange.com/questions/113530/how-to-find-out-namespace-of-a-particular-process 431 | 432 | # 查看pid所述的容器/pod 433 | nsenter -t ${pid} -u hostname 434 | 435 | # 查看pid所在容器的内存用量 436 | nsenter -t ${pid} -m cat /sys/fs/cgroup/memory/memory.usage_in_bytes 437 | 438 | # 查看pid所在容器的cpu使用率(近10秒) 439 | function cpu-usage { 440 | local pid=$1 441 | local start=$(nsenter -t ${pid} -m cat /sys/fs/cgroup/cpu/cpuacct.usage 2>/dev/null) 442 | sleep 10s 443 | local end=$(nsenter -t ${pid} -m cat /sys/fs/cgroup/cpu/cpuacct.usage 2>/dev/null) 444 | if [ "${start}" != "" ] && [ "${end}" != "" ]; then 445 | # echo "(${end} - ${start}) / 100000000" | bc 446 | local cpuacct=$[${end} - ${start}] 447 | echo $[${cpuacct}/100000000]% 448 | fi 449 | } 450 | ``` 451 | 452 | ## 常用工具 453 | 454 | ### lsns 455 | 456 | `lsns`工具来自包`util-linux`,其常见使用如下: 457 | 458 | ```bash 459 | # 查看网络命名空间列表 460 | lsns -t net 461 | 462 | # 查看pid命名空间列表 463 | lsns -t pid 464 | ``` 465 | 466 | 467 | 468 | ### nsenter 469 | 470 | ```bash 471 | nsenter --net=/proc/19714/ns/net ip addr 472 | nsenter -t 19714 -u hostname 473 | nsenter -t 19714 -m -u -i -n -p bash 474 | nsenter -t 19714 -m -p bash 475 | nsenter -t 12472 -m -p umount /var/lib/origin/openshift.local.volumes/pods//volumes/ctriple.cn~drbd/r0002 476 | nsenter -t 19714 -m -p ps -ef 477 | nsenter -t ${pid} -m cat /sys/devices/virtual/net/eth0/iflink 2>/dev/null 478 | nsenter -t 7429 -n cat /proc/net/route 479 | nsenter -t 12345 -n tcpdump -i eth0 -nnl # 关联容器的网络命名空间,直接在宿主机上抓容器里eth0接口的报文 480 | nsenter -t 14756 -n ip link set eth0 address ee:ee:ee:ee:ee:ee # 修改容器 MAC 地址 481 | ``` 482 | 483 | 484 | 485 | ### unshare 486 | 487 
| 使用不同的命名空间运行程序,详见`man 1 unshare` 488 | 489 | >run program with some namespaces unshared from parent 490 | 491 | 492 | # OCI标准 493 | ## 运行时标准 runtime-spec 494 | 495 | ## 镜像标准 image-spec 496 | 标准链接[image-spec](https://github.com/opencontainers/image-spec/blob/main/spec.md) 497 | 498 | ### manifest 499 | 包括如下信息: 500 | * **layer**, that will be unpacked to make up the final runnable filesystem. 501 | * **image config**, includes information such as application arguments, environments, etc. 502 | * **image index**, a higher-level manifest which points to a list of manifests and descriptors. 503 | 504 | 最终能够`unpacked into an OCI Runtime Bundle.` 505 | 506 | ### OCI Image Media Types 507 | https://github.com/opencontainers/image-spec/blob/main/media-types.md 508 | 509 | ### layer 510 | 511 | 512 | ### blobs 513 | ```bash 514 | $ shasum -a 256 ./blobs/sha256/afff3924849e458c5ef237db5f89539274d5e609db5db935ed3959c90f1f2d51 515 | afff3924849e458c5ef237db5f89539274d5e609db5db935ed3959c90f1f2d51 ./blobs/sha256/afff3924849e458c5ef237db5f89539274d5e609db5db935ed3959c90f1f2d51 516 | ``` 517 | 518 | ## 分发标准 distribution-spec 519 | 520 | # 容器镜像 521 | ## 从无到有制作基础镜像 522 | 比如制作一个CentOS操作系统的基础镜像,使用CentOS的yum源即可: 523 | ```bash 524 | mkdir -p /tmp/test/baseimage 525 | # 往/tmp/test/baseimage这个目录安装bash和yum,过程中会自动解决依赖 526 | yum -c /etc/yum.conf --installroot=/tmp/test/baseimage --releasever=/ install bash yum 527 | 528 | # 进入目录可以看到rootfs 529 | [root@xxx baseimage]# ls 530 | bin boot dev etc home lib lib64 media mnt opt proc root run sbin srv sys tmp usr var 531 | # 此时,可以手动修改rootfs中的文件,例如修改etc/yum.repos.d目录下*.repo,定制仓库路径 532 | 533 | # 生成并上传基础镜像 534 | tar --numeric-owner -c -C "/tmp/test/baseimage" . | docker import - docker.io/ytinirt/baseimage:v1 535 | docker push docker.io/ytinirt/baseimage:v1 536 | ``` 537 | 538 | ## 采用合并打包实现缩容 539 | TODO 540 | 541 | ## 移除基础镜像层实现缩容 542 | 在无法合并打包时,可采用移除基础镜像层的方式实现应用镜像的缩容。 543 | 544 | 大致原理为,确保目的地容器存储中已存在基础镜像,可将应用镜像中包含于基础镜像的layer删除并重新打包应用镜像,实现应用镜像缩容的目的。 545 | 传输到目的地,加载镜像时,虽然应用镜像tar包中没有基础镜像layer,但目的地容器存储中已存在对应的基础layer,因此应用镜像也能加载成功。 546 | 547 | ## 使用buildx构建多架构容器镜像 548 | 参考资料: 549 | - https://docs.docker.com/buildx/working-with-buildx/ 550 | - https://medium.com/@artur.klauser/building-multi-architecture-docker-images-with-buildx-27d80f7e2408 551 | - https://github.com/docker/buildx 552 | - https://github.com/docker/buildx/issues/80 553 | 554 | 环境要求: 555 | - 内核版本:4.8及以上(如果用CentOS,建议直接装CentOS 8) 556 | - Docker版本: 19.03及以上(要使用buildx,19.x版本可能需要开启docker Experimental mode。而20.10.8已默认开启buildx命令。建议使用最新版本的Docker) 557 | 558 | 环境准备和Demo 559 | ```bash 560 | # 通过容器方式,准备多架构编译环境(注意,节点重启后需要重新run一次容器) 561 | docker run --rm --privileged multiarch/qemu-user-static --reset -p yes 562 | 563 | # 创建并使用builder 564 | docker buildx create --use --name mybuilder --driver-opt network=host 565 | # 此处使用主机网络"network=host",能用到宿主机/etc/hosts,是为了解决私有仓库域名解析的问题 566 | 567 | # 检查builder,并触发其准备就绪,实际上就是启一个buidler容器 568 | docker buildx inspect --bootstrap 569 | 570 | # 拷贝为私有仓库签发证书的CA的证书到builder容器,并重启builder容器,解决私有仓库证书问题 571 | BUILDER_ID=$(docker ps|grep 'moby/buildkit' | awk '{print $1}') 572 | docker cp ${BUILDER_ID}:/etc/ssl/certs/ca-certificates.crt 573 | docker restart ${BUILDER_ID} 574 | 575 | # 查看builder,已支持多种架构 576 | docker buildx ls 577 | # 类似如下输出,可看到支持多种架构 578 | # NAME/NODE DRIVER/ENDPOINT STATUS PLATFORMS 579 | # mybuilder * docker-container 580 | # mybuilder0 unix:///var/run/docker.sock running linux/amd64, linux/arm64, linux/riscv64, linux/ppc64le, linux/s390x, linux/386, linux/mips64le, linux/mips64, 
linux/arm/v7, linux/arm/v6 581 | 582 | # 准备镜像的Dockerfile和依赖资源文件,例如 583 | cat << EOF > Dockerfile 584 | FROM alpine:latest 585 | CMD echo "Running on $(uname -m)" 586 | EOF 587 | 588 | # 登录镜像仓库 589 | 590 | # 构建多架构镜像,并自动以manifest list方式push到镜像仓库 591 | docker buildx build -t "ytinirt/buildx-test:latest" --platform linux/amd64,linux/arm64 --push . 592 | 593 | # 查看镜像 594 | docker manifest inspect ytinirt/buildx-test:latest 595 | 596 | # 可选:删除builder,什么都没发生过 597 | docker buildx rm mybuilder 598 | ``` 599 | 600 | 601 | # 容器存储 602 | 603 | ## overlay2 604 | 605 | ### 存储配额限制 606 | 参见[storage-driver-options](https://docs.docker.com/engine/reference/commandline/dockerd/#storage-driver-options)。即使采用overlay2存储驱动,也可以借助xfs的pquota特性,为容器rw层做限制。 607 | > overlay2.size 608 | > 609 | > Sets the default max size of the container. It is supported only when the backing fs is xfs and mounted with pquota mount option. Under these conditions the user can pass any size less than the backing fs size. 610 | 611 | 更进一步,通过`xfs`文件系统的`pquota`属性,可以实现文件夹级别的存储配额限制。 612 | 613 | ### 容器可读可写层用量统计 614 | ```bash 615 | # 进入overlay的数据目录 616 | cd /var/lib/containers/storage/overlay 617 | # 统计容器可读可写层新增文件大小统计排序 618 | for d in $(find . -name "diff" -type d -maxdepth 2 2>/dev/null); do du -sh $d 2>/dev/null; done | grep -v ^0 | grep -v K | sort -n 619 | ``` 620 | 621 | ### 根据overlay数据目录digest反查容器/镜像 622 | ```bash 623 | for cid in $(crictl ps -a -q ); do echo $cid; crictl inspect $cid | grep ; done 624 | for cid in $(podman ps -aq); do echo $cid; podman inspect $cid | grep ; done 625 | for iid in $(crictl img | sed 1d | awk '{print $3}'); do echo $iid; crictl inspecti $iid | grep ; done 626 | 627 | ``` 628 | 629 | ## 宿主机上直接修改容器内文件 630 | 631 | 宿主机上`/proc/{pid}/cwd`是pid所在进程当前的工作路径,如果pid是容器中业务进程在宿主机上的进程号,那么cwd文件夹中能直接看到容器中“当前工作目录”。 632 | 因此,宿主机上直接修改cwd文件夹中的内容,也能在容器中生效。 633 | 634 | 635 | # 容器安全 636 | 637 | 参考文档: 638 | 639 | - [Overview Of Linux Kernel Security Features](https://www.linux.com/tutorials/overview-linux-kernel-security-features/) 640 | - [Configure a Security Context for a Pod or Container](https://kubernetes.io/docs/tasks/configure-pod-container/security-context/) 641 | - [Pod Security Policies](https://kubernetes.io/docs/concepts/policy/pod-security-policy/) 642 | 643 | ## Discretionary Access Control 644 | 645 | 通过user ID (UID)和group ID (GID),实行访问控制。 646 | 647 | 为Pod/容器的安全上下文securityContext设置uid和gid: 648 | 649 | ~~~yaml 650 | apiVersion: v1 651 | kind: Pod 652 | metadata: 653 | name: security-context-demo 654 | spec: 655 | securityContext: 656 | runAsUser: 1000 657 | runAsGroup: 3000 658 | fsGroup: 2000 659 | volumes: 660 | - name: sec-ctx-vol 661 | emptyDir: {} 662 | containers: 663 | - name: sec-ctx-demo 664 | image: busybox 665 | command: [ "sh", "-c", "sleep 1h" ] 666 | volumeMounts: 667 | - name: sec-ctx-vol 668 | mountPath: /data/demo 669 | securityContext: 670 | runAsUser: 2000 671 | allowPrivilegeEscalation: false 672 | ~~~ 673 | 674 | 其中fsGroup施加到volume上,修改volume下文件/文件夹的GID。 675 | 676 | 677 | 678 | ## linux capabilities 679 | 680 | 定义文档参见[capability.h](https://github.com/torvalds/linux/blob/master/include/uapi/linux/capability.h) 681 | 682 | 查看当前进程的capabilities 683 | 684 | ~~~bash 685 | # cat /proc/$$/status | grep Cap 686 | CapInh: 0000000000000000 687 | CapPrm: 0000003fffffffff 688 | CapEff: 0000003fffffffff 689 | CapBnd: 0000003fffffffff 690 | CapAmb: 0000000000000000 691 | ~~~ 692 | 693 | 为Pod设置capabilities 694 | 695 | ~~~yaml 696 | apiVersion: v1 697 | kind: Pod 698 | metadata: 699 | name: security-context-demo 700 
| spec: 701 | containers: 702 | - name: sec-ctx 703 | image: gcr.io/google-samples/node-hello:1.0 704 | securityContext: 705 | capabilities: 706 | add: ["SYS_TIME", "SYS_ADMIN"] 707 | ~~~ 708 | * 增加`SYS_ADMIN`,容器内能够`mount`操作。 709 | * 增加`SYS_TIME`,容器内能够设置系统时间。 710 | 711 | 712 | 注意,在add和drop时,去掉了前缀`CAP_`。 713 | 714 | 进一步[阅读](https://cloud.redhat.com/blog/linux-capabilities-in-openshift) 。 715 | 716 | ## seccomp 717 | 718 | ### 原理 719 | 参考资料[seccomp](https://docs.docker.com/engine/security/seccomp) 720 | 721 | SECure COMPuting mode (简称seccomp)是Linux内核一种特性(Linux kernel feature)。能够过滤系统调用(Filter a process’s system calls)。 722 | 相较linux capabilities,权限控制粒度更细。 723 | 利用seccomp特性,Docker能够限制容器中能够访问的系统调用(system call),防止容器中的操作危害整个节点。 724 | 725 | 726 | ### 确认操作系统和容器运行时支持seccomp 727 | 通过如下操作,确认Linux和Docker支持seccomp: 728 | ```bash 729 | [root@zy-super-load docker]# docker info 730 | ... 731 | Security Options: 732 | seccomp 733 | WARNING: You're not using the default seccomp profile 734 | Profile: /etc/docker/seccomp.json 735 | selinux 736 | Kernel Version: 3.10.0-862.14.4.el7.x86_64 737 | ... 738 | [root@zy-super-load docker]# grep 'CONFIG_SECCOMP=' /boot/config-$(uname -r) 739 | CONFIG_SECCOMP=y 740 | ``` 741 | 742 | 从上述docker info中看到,docker的seccomp配置文件路径为`/etc/docker/seccomp.json`。 743 | 该配置文件采用白名单模式,即容器内可访问seccomp.json列出的系统调用,除此之外的系统调用无法访问,默认(SCMP_ACT_ERRNO)返回Permission Denied。 744 | 745 | ### 为pod设置seccomp自定义策略 746 | 以设置系统时间为例: 747 | ~~~bash 748 | [root@zy-super-load ~]# strace date -s "15:22:00" 2>&1| grep -i time 749 | ... 750 | clock_settime(CLOCK_REALTIME, {1575530520, 0}) = 0 751 | ... 752 | ~~~ 753 | 754 | 其用到了系统调用`clock_settime`。 755 | 为Pod设置seccomp profile 756 | ```yaml 757 | apiVersion: v1 758 | kind: ReplicationController 759 | ... 760 | spec: 761 | replicas: 1 762 | selector: 763 | app: seccomp-demo 764 | template: 765 | metadata: 766 | annotations: 767 | seccomp.security.alpha.kubernetes.io/pod: "localhost/test-profile.json" 768 | labels: 769 | app: seccomp-demo 770 | spec: 771 | containers: 772 | - command: 773 | - /bin/bash 774 | ... 775 | ``` 776 | 当指定为`localhost`时,默认从`/var/lib/kubelet/seccomp/`中搜索profile文件,详见`kubelet`的`--seccomp-profile-root`参数。 777 | 当`test-profile.json`中禁止系统调用`clock_settime`后,在pod中使用date设置系统时间失败。 778 | 779 | ### 排错 780 | 781 | #### runc加载seccomp策略时报“OCI permission denied” 782 | 详见[issue](https://github.com/containers/common/issues/631) 783 | 784 | ## AppArmor 785 | https://kubernetes.io/docs/tutorials/security/apparmor/ 786 | 787 | AppArmor通过调整配置文件(Profile)进行策略配置,以允许特定程序或容器所需的访问, 如 Linux 权能字、网络访问、文件权限等。 788 | 每个Profile都可以在 强制(enforcing) 模式(阻止访问不允许的资源)或 投诉(complain) 模式(仅报告冲突)下运行。 789 | 790 | AppArmor的Profile施加到Pod的每个容器上,具体的,通过Pod的注解指定容器及其使用的Profile,注解示例如下: 791 | ``` 792 | container.apparmor.security.beta.kubernetes.io/: 793 | ``` 794 | 795 | ### 使用AppArmor的前置条件 796 | 1. 检查是否开启AppArmor内核模块 797 | ```bash 798 | # 输出为Y 799 | cat /sys/module/apparmor/parameters/enabled 800 | # 或者 801 | cat /boot/config-$(uname -r) | grep CONFIG_SECURITY_APPARMOR 802 | ``` 803 | 2. 容器运行时支持AppArmor,主流的容器运行时,例如containerd和cri-o,均支持AppArmor 804 | 3. 
AppArmor的Profile文件已加载,如果Profile文件未加载,kubelet将拒绝创建使用该Profile的Pod 805 | ```bash 806 | # 查看已加载的Profile文件 807 | cat /sys/kernel/security/apparmor/profiles | sort 808 | ``` 809 | 810 | ## selinux 811 | 812 | 参考资料[HowTos/SELinux](https://wiki.centos.org/HowTos/SELinux) 813 | 814 | SELinux是对文件(file)和资源(例如process、device等)的访问权限控制,是对传统的discretionary access control (DAC) 的补充。 815 | SELinux参照最小权限模型(the model of least-privilege)设计,与之匹配的是严格策略(the strict policy),除非显式配置指定否则默认情况下所有访问均被拒绝(denied)。 816 | 但strict policy过于严格、不便使用,为此CentOS定义并默认采用基于目标的策略(the targeted policy),只针对选取的系统进程进行限制,这些进程(例如 httpd, named, dhcpd, mysqld)涉及敏感信息和操作。其它系统进程和用户进程则处于未限制域(unconfined domain)中,不由SELinux控制和保护。 817 | 818 | targeted policy有四种形式的访问控制: 819 | 820 | | 类型 | 描述 | 821 | |----------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------| 822 | | Type Enforcement (TE) | Type Enforcement is the primary mechanism of access control used in the targeted policy | 823 | | Role-Based Access Control (RBAC) | Based around SELinux users (not necessarily the same as the Linux user), but not used in the default configuration of the targeted policy | 824 | | Multi-Level Security (MLS) | Not commonly used and often hidden in the default targeted policy | 825 | | Multi-Category Security(MCS) | An extension of Multi-Level Security, used in the targeted policy to implement compartmentalization of virtual machines and containers through sVirt | 826 | 827 | 所有进程和文件都含有SELinux安全啥下文(SELinux security context)信息 828 | ```bash 829 | [root@op-master containers]# pwd 830 | /var/lib/docker/containers 831 | [root@op-master containers]# docker ps | grep nginx 832 | ... 833 | 6b312ef59368 nginx:1.14-alpine "nginx -g 'daemon ..." 4 days ago Up 4 days 80/tcp, 0.0.0.0:8888->8888/tcp apiserver-proxy 834 | [root@op-master containers]# cd 6b312ef59368/ 835 | [root@op-master 6b312ef59368]# ls -Z config.v2.json 836 | -rw-r--r--. root root system_u:object_r:container_var_lib_t:s0 config.v2.json 837 | [root@op-master 6b312ef59368]# 838 | ``` 839 | 其中,`system_u:object_r:container_var_lib_t:s0`就是在标准的DAC上增加的SELinux安全上下文信息。格式为`user:role:type:mls`,因此类型为`container_var_lib_t`。 840 | 841 | ```bash 842 | [root@op-master ~]# ps -efZ | grep 6b312ef593 843 | system_u:system_r:container_runtime_t:s0 root 22190 18571 0 Apr12 ? 00:00:38 /usr/bin/docker-containerd-shim-current 6b312ef59368 /var/run/docker/libcontainerd/6b312ef59368 /usr/libexec/docker/docker-runc-current 844 | ``` 845 | 可看到该容器的shim进程SELinux安全上下文,标识该进程类型为`container_runtime_t`,与上述config.v2.json文件的类型`container_var_lib_t`类似、均属于container_t域下,因此shim进程可以访问该文件。 846 | 847 | 848 | ### 深入学习 849 | TODO: 850 | * https://blog.csdn.net/xsm666/article/details/81357363 851 | * https://danwalsh.livejournal.com/81756.html 852 | 853 | 854 | ### 一次完整的报错分析 855 | ``` 856 | Apr 01 09:43:22 master0 setroubleshoot[1417162]: SELinux is preventing /usr/sbin/xtables-nft-multi from ioctl access on the directory /sys/fs/cgroup. For complete SELinux messages run: sealert -l e1a4eb18-019a-4552-bd0c-4706ada83ab9 857 | Apr 01 09:43:22 master0 setroubleshoot[1417162]: SELinux is preventing /usr/sbin/xtables-nft-multi from ioctl access on the directory /sys/fs/cgroup. 858 | 859 | ***** Plugin catchall (100. confidence) suggests ************************** 860 | 861 | If you believe that xtables-nft-multi should be allowed ioctl access on the cgroup directory by default. 862 | Then you should report this as a bug. 
863 | You can generate a local policy module to allow this access. 864 | Do 865 | allow this access for now by executing: 866 | # ausearch -c 'iptables' --raw | audit2allow -M my-iptables 867 | # semodule -X 300 -i my-iptables.pp 868 | 869 | Apr 01 09:43:22 master0 setroubleshoot[1417162]: AnalyzeThread.run(): Set alarm timeout to 10 870 | ``` 871 | 872 | ### 常用操作 873 | ```bash 874 | # 设置SELinux模式 875 | setenforce 0 876 | 877 | # 查询当前SELinux模式 878 | getenforce 879 | 880 | # 查看SELinux状态 881 | sestatus 882 | 883 | # 设置具体elements的SELinux策略 884 | semanage 885 | 886 | # 查看文件的SELinux标签 887 | ls -Z 888 | 889 | # 查看进程的SELinux标签 890 | ps -efZ 891 | 892 | # 设置文件的SELinux标签 893 | chcon 894 | chcon -v --type=httpd_sys_content_t /html 895 | chcon -Rv --type=httpd_sys_content_t /html 896 | chcon -R --type container_file_t /var/lib/hostdir 897 | 898 | restorecon -R /html 899 | 900 | # 查看审计日志 901 | ausearch -m avc --start recent 902 | ausearch -ui 0 903 | setsebool -P virt_use_nfs 1 904 | ``` 905 | 906 | 907 | ### 为Pod/容器设置selinux label 908 | ```yaml 909 | ... 910 | securityContext: 911 | seLinuxOptions: 912 | level: "s0:c123,c456" 913 | ... 914 | ``` 915 | 其中seLinuxOptions施加到volume上。一般情况下,只需设置level,其为Pod及其volumes设置Multi-Category Security (MCS) label。 916 | 注意,一旦为Pod设置了MCS label,其它所有相同label的pod均可访问该Pod的volume。 917 | 918 | ### 根据审计日志设置selinux规则 919 | 若遇到selinux拦截操作,例如: 920 | ``` 921 | SELinux is preventing /usr/sbin/lldpad from sendto access on the unix_dgram_socket ... 922 | ``` 923 | 924 | 可以使用如下命令放开拦截: 925 | ```bash 926 | ausearch -m avc --start recent 927 | 928 | # 根据审计日志,查找被拦截的操作,并生成允许的规则 929 | ausearch -c 'lldpad' --raw | audit2allow -M my-lldpad 930 | # 设置selinux,放开拦截 931 | semodule -X 300 -i my-lldpad.pp 932 | ``` 933 | 934 | # 容器运行时 935 | ## runc 936 | ### 常用命令 937 | ```bash 938 | # 查看容器列表 939 | runc --root=/run/containerd/runc/k8s.io list 940 | 941 | # 查看容器进程信息 942 | # 其中可以通过 ctr -n k8s.io c ls | grep 获取 943 | runc --root /run/containerd/runc/k8s.io ps 944 | 945 | # 进入容器执行命令 946 | runc --root /run/containerd/runc/k8s.io exec -t bash 947 | 948 | # 使用resume命令,解除paused状态 949 | runc --root=/run/containerd/runc/k8s.io resume 950 | 951 | # 查看容器状态 952 | runc --root=/run/containerd/runc/k8s.io state <容器ID 64位长号> 953 | 954 | # 更新容器资源配置 955 | runc update --cpu-share 100 956 | ``` 957 | 958 | ### cri-o如何通过conmon调用runc创建容器 959 | ``` 960 | # 命令示例 961 | /usr/bin/runc 962 | --systemd-cgroup 963 | --root=/run/runc 964 | create 965 | --bundle /run/containers/storage/overlay-containers//userdata 966 | --pid-file /run/containers/storage/overlay-containers//userdata/pidfile 967 | 968 | ``` 969 | 970 | ## crun 971 | C语言实现的容器运行时。 972 | 973 | 资料: 974 | * [introduction](https://www.redhat.com/sysadmin/introduction-crun) 975 | * [fedora-31-control-group-v2](https://www.redhat.com/sysadmin/fedora-31-control-group-v2) 976 | 977 | # OCI 978 | ## oci-hooks 979 | 配置一个hook: 980 | ```bash 981 | # cat /etc/containers/oci/hooks.d/hook.json 982 | { 983 | "version": "1.0.0", 984 | "hook": { 985 | "path": "/root/runtime-hook.sh", 986 | "args": ["runtime-hook.sh"] 987 | }, 988 | "when": { 989 | "annotations": { 990 | "^ANNON\\.HEHE$": ".*" 991 | } 992 | }, 993 | "stages": ["prestart"] 994 | } 995 | ``` 996 | 997 | hook执行操作: 998 | ```bash 999 | # cat /root/runtime-hook.sh 1000 | #!/bin/bash 1001 | 1002 | echo "$@" >> /root/runtime-hook.log 1003 | env >> /root/runtime-hook.log 1004 | echo >> /root/runtime-hook.log 1005 | ``` 1006 | 1007 | 另一个完整的例子: 1008 | ``` 1009 | # 容器日志采集信息解析脚本 1010 | cat << EEOOFF > 
/usr/local/bin/logging-oci-hook.sh 1011 | #!/bin/bash 1012 | 1013 | pwd >> /tmp/logging-oci-hook.log 1014 | cat config.json | jq -r '.annotations["workload.logging.io/paths"]' >> /tmp/logging-oci-hook.log 1015 | EEOOFF 1016 | 1017 | # 为脚本赋予可执行权限 1018 | chmod a+x /usr/local/bin/logging-oci-hook.sh 1019 | 1020 | 1021 | # 配置oci hook,用于监听容器日志采集任务,并触发更新容器日志采集信息 1022 | cat << EEOOFF > /etc/containers/oci/hooks.d/logging-hook.json 1023 | { 1024 | "version": "1.0.0", 1025 | "hook": { 1026 | "path": "/usr/local/bin/logging-oci-hook.sh" 1027 | }, 1028 | "when": { 1029 | "annotations": { 1030 | "^workload.logging.io/paths$": ".*" 1031 | } 1032 | }, 1033 | "stages": ["poststart"] 1034 | } 1035 | EEOOFF 1036 | 1037 | # 重启crio,使oci hook生效 1038 | systemctl restart crio 1039 | 1040 | ``` 1041 | 1042 | # Containerd 1043 | ## 常用操作 1044 | ```bash 1045 | # 在线收集containerd的dump信息,堆栈文件保存在/tmp目录中 1046 | kill -s SIGUSR1 $(pidof containerd) 1047 | 1048 | # 批量导出容器 1049 | ctr -n k8s.io i export image.tar coredns:v1.7.0 kube-proxy:v1.18.8 1050 | 1051 | # 使用containerd客户端 1052 | docker-ctr-current --address unix:///var/run/docker/libcontainerd/docker-containerd.sock 1053 | 1054 | # 日志查看 1055 | # 方式1: 目录 /var/run/containerd/io.containerd.grpc.v1.cri/containers 下能够看到容器stdout和stderr的pipe文件。 1056 | # 直接cat pipe文件,就能看到标准和错误输出。注意,这里只能看到实时输出。 1057 | cat /var/run/containerd/io.containerd.grpc.v1.cri/containers/<容器id>/io/2615573161/<容器id>-stdout 1058 | # 方式2: 目录 /var/log/pods 下能够看到kubelet保存的容器日志输出,kubelet也是使用上了上述1把容器的stdout和stderr输出到/var/log下, 1059 | # 实现查看历史日志得能力,提升易用性。 1060 | cat /var/log/pods/kube-system_apiserver-proxy-xxx/nginx/0.log 1061 | 1062 | # 查看容器指标信息,例如cpu、内存开销 1063 | ctr -n k8s.io t metric 1064 | 1065 | # 挂载镜像 1066 | ctr -n k8s.io i mount centos:8 /mnt 1067 | # 解除挂载 1068 | ctr -n k8s.io i unmount /mnt 1069 | ``` 1070 | 1071 | ## 如何编译containerd 1072 | 可直接在ARM架构的环境编译aarch64,如下示例包含containerd与runc 1073 | ```bash 1074 | docker run -it --privileged --network host\ 1075 | -v /var/lib/containerd \ 1076 | -v ${PWD}/runc:/go/src/github.com/opencontainers/runc \ 1077 | -v ${PWD}/containerd:/go/src/github.com/containerd/containerd \ 1078 | -e GOPATH=/go \ 1079 | -w /go/src/github.com/containerd/containerd containerd/build-aarch64:1.1.0 sh 1080 | # 进入容器里操作 1081 | # 编译 runc 1082 | cd /go/src/github.com/opencontainers/runc 1083 | make 1084 | # 编译 containerd 1085 | cd /go/src/github.com/containerd/containerd 1086 | make 1087 | ``` 1088 | 1089 | # CRI-O 1090 | ```bash 1091 | # 查看当前生效的配置 1092 | crio-status config | grep -i pid 1093 | ``` 1094 | 1095 | ## 统计容器可读可写层存储用量 1096 | ```bash 1097 | # 复杂的方法 1098 | for config in $(ls /var/lib/containers/storage/overlay-containers/*/userdata/config.json) 1099 | do 1100 | diff=$(cat $config 2>/dev/null | jq .root.path -r|sed 's/merged$/diff/g') 1101 | du -s $diff 1102 | done | awk '{s+=$1} END {print s}' 1103 | 1104 | # 简单的方法,统计出容器可读可写层总用量大小 1105 | crictl stats -a -o json | jq -r '.stats[].writableLayer.usedBytes.value' | awk '{s+=$1}END{print s/1024/1024/1024}' 1106 | ``` 1107 | 1108 | ## 指定seccomp profile 1109 | ```bash 1110 | # /etc/crio/crio.conf 1111 | [crio.runtime] 1112 | seccomp_profile = "/etc/crio/seccomp.json" 1113 | ``` 1114 | 1115 | 通过配置空的`seccomp.json`文件,放开所有限制: 1116 | ```bash 1117 | # cat /etc/crio/seccomp.json 1118 | {} 1119 | ``` 1120 | 1121 | ## 容器存储目录 1122 | * `/run/containers/storage/overlay-containers//userdata/`,放置这个pod的`hostname`和`resolv.conf`等。 1123 | * `/run/containers/storage/overlay-containers//userdata/`,放置容器的配置文件、挂载点等。 1124 | 1125 | ## non-root用户使用devices 1126 | 参见 
[non-root-containers-and-devices](https://kubernetes.io/blog/2021/11/09/non-root-containers-and-devices/) 。 1127 | 1128 | ```bash 1129 | # 修改crio配置,开启 device_ownership_from_security_context 1130 | cat << EEOOFF > /etc/crio/crio.conf.d/10-device-ownership 1131 | [crio.runtime] 1132 | device_ownership_from_security_context = true 1133 | EEOOFF 1134 | 1135 | # 重启crio使配置生效 1136 | systemctl restart crio 1137 | 1138 | # 检查配置生效 1139 | crio-status c | grep device_ownership_from_security_context 1140 | ``` 1141 | 1142 | ## 检查容器存储数据量是否合理 1143 | overlay元数据中id数: 1144 | ```bash 1145 | sudo cat /var/lib/containers/storage/overlay-layers/layers.json /var/lib/containers/storage/overlay-layers/volatile-layers.json | jq . | grep -c "id\"" 1146 | ``` 1147 | 1148 | 和`/var/lib/containers/storage/overlay`目录下文件夹数(除`l`文件夹外)是否接近: 1149 | ```bash 1150 | sudo ls /var/lib/containers/storage/overlay | wc -l 1151 | ``` 1152 | 1153 | 参考链接[cri-o/issues/6981](https://github.com/cri-o/cri-o/issues/6981#issuecomment-1608606437) 1154 | 1155 | ## 配置修改 1156 | ### 修改容器内ulimit配置 1157 | 在`[crio.runtime]`段下面,增加如下内容,例如将coredump文件限制到2GB大小: 1158 | ```bash 1159 | default_ulimits = [ 1160 | "core=2147483648:2147483648" 1161 | ] 1162 | ``` 1163 | 1164 | 然后重启crio生效(重载reload配置是否就够了): 1165 | ```bash 1166 | systemctl restart crio 1167 | ``` 1168 | 1169 | 注意,后续只有新建的容器才会生效。 1170 | 1171 | ## 问题debug 1172 | **调整日志级别**: 1173 | ```bash 1174 | # 修改日志级别log_level为info、debug或trace 1175 | /etc/crio/crio.conf.d/00-default 1176 | 1177 | # 重载配置 1178 | systemctl reload crio 1179 | ``` 1180 | 1181 | **获取pprof数据**: 1182 | ```bash 1183 | # 通过环境变量,指定开启pprof 1184 | Environment="ENABLE_PROFILE_UNIX_SOCKET=true" 1185 | 1186 | # 获取pprof数据,例如goroutine 1187 | curl --unix-socket /var/run/crio/crio.sock http://localhost/debug/pprof/goroutine?debug=1 1188 | # 例如内存信息 1189 | curl --unix-socket /var/run/crio/crio.sock http://localhost/debug/pprof/heap 1190 | 1191 | # 当crio不响应时获取goroutine调用栈,调用栈信息保存在 /tmp/crio-goroutine-stacks-* 文件 1192 | systemctl kill -s USR1 crio.service 1193 | ``` 1194 | 1195 | **通过unix socket直接调用API**: 1196 | ```bash 1197 | # 查询容器详情 1198 | curl --unix-socket /var/run/crio/crio.sock http://localhost/containers/{CONTAINER_ID} 1199 | ``` 1200 | 1201 | ### 看crio日志 1202 | 1203 | #### 创建容器失败 1204 | ```bash 1205 | # 创建容器失败 1206 | journalctl -u crio | grep "Container creation error" 1207 | ``` 1208 | 1209 | ## Deep Dive 1210 | ### 创建容器 1211 | 创建容器核心逻辑在`createSandboxContainer()`。 1212 | 1213 | cri server -> conmon -> runc -> user container process 1214 | 1215 | ### 列出镜像ImageService/ListImages 1216 | 以`/runtime.v1alpha2.ImageService/ListImages`为例,`storage/storage_transport.go`中会从容器存储中,解析对应的镜像信息并返回。 1217 | 1218 | 核心逻辑在`ParseStoreReference()`的`parsed reference into`。 1219 | 1220 | # podman 1221 | ## 配置管理 1222 | * 配置文件在`/usr/share/containers/`和`/etc/containers/`。 1223 | * 默认seccomp策略文件路径`/usr/share/containers/seccomp.json`。 1224 | 1225 | ## 使用podman查看cri创建的pod 1226 | ```bash 1227 | podman ps --all --external 1228 | podman ps --all --storage 1229 | ``` 1230 | 1231 | ## 容器镜像和overlay/layer对应关系 1232 | 1. `podman images`看到的镜像ID(`IMAGE ID`)即本地缓存镜像的id,具体对应于`/var/lib/containers/storage/overlay-images`目录下一个个文件夹 1233 | 2. `/var/lib/containers/storage/overlay-images/*/manifest`中有容器镜像的`layer`信息及每一层的大小 1234 | 3. ??? 1235 | 1236 | ## 在login后podman的认证信息可能存放的几个地方 1237 | 1. Linux默认在`${XDG_RUNTIME_DIR}/containers/auth.json`,即例如`/run/user/0/containers/auth.json` 1238 | 2. Windows和macOS默认在`$HOME/.config/containers/auth.json` 1239 | 3. 
若缺失上述文件,则继续检查`$HOME/.docker/config.json`,即兼容使用`docker login`认证信息 1240 | 1241 | ## 创建manifest list支持多架构镜像 1242 | ```bash 1243 | # 新建一个manifest list 1244 | podman manifest create localhost/flannel:v0.23.0 1245 | # 向manifest list中添加镜像 1246 | podman manifest add localhost/flannel:v0.23.0 foo.bar/dev/flannel:v0.23.0-amd64 foo.bar/dev/flannel:v0.23.0-arm64 1247 | # 【可选】查看manifest list中镜像列表,检查各镜像携带的arch、variant、os等信息 1248 | podman manifest inspect localhost/flannel:v0.23.0 1249 | # 【可选】如果镜像没有arch信息,需要通过annotate为镜像添加arch等信息 1250 | podman manifest annotate --arch "amd64" localhost/flannel:v0.23.0 foo.bar/dev/flannel:v0.23.0-amd64 1251 | podman manifest annotate --arch "arm64" localhost/flannel:v0.23.0 foo.bar/dev/flannel:v0.23.0-arm64 1252 | # 上传manifest list至镜像仓库 1253 | podman manifest push localhost/flannel:v0.23.0 foo.bar/dev/flannel:v0.23.0 1254 | ``` 1255 | 1256 | ## 使用podman统计容器镜像大小 1257 | ```bash 1258 | # 查看容器镜像总大小 1259 | podman system df 1260 | # 使用 Go template 自定义输出 1261 | podman system df --format "{{json .}}" 1262 | 1263 | # 查看各容器镜像大小、shared size和unique size 1264 | podman system df -v 1265 | ``` 1266 | 1267 | ## 常用命令 1268 | ```bash 1269 | # 查看当前挂载的容器镜像 1270 | podman image mount 1271 | 1272 | # 挂载容器镜像 1273 | podman image mount quay.io/openshift-scale/etcd-perf:latest 1274 | 1275 | # 卸载容器镜像 1276 | podman image unmount quay.io/openshift-scale/etcd-perf:latest 1277 | 1278 | # 查看镜像详情 1279 | cat /var/lib/containers/storage/overlay-images/images.json | jq 1280 | 1281 | # 调整日志级别 1282 | podman pull --authfile /path/to/config.json --log-level debug 1283 | 1284 | # 启容器但不分配网络 1285 | podman run -it --rm --net=none centos:latest bash 1286 | ``` 1287 | 1288 | # crictl 1289 | _crictl_ 访问*cri server*,同kubelet的行为一致,因此常用于站在kubelet角度去debug容器运行时。 1290 | 1291 | ## 直接创建容器 1292 | _crictl_ 拉起容器比*podman*等CLI工具麻烦,需要编辑json或yaml格式的配置文件,再拉起容器。而且,其行为同kubelet一致,因此拉起容器前还需要创建pod sandbox容器。 1293 | 1294 | ### 创建Pod Sandbox 1295 | sandbox配置文件`sandbox.json`如下: 1296 | ```json 1297 | { 1298 | "metadata": { 1299 | "name": "sandbox", 1300 | "namespace": "default", 1301 | "attempt": 1, 1302 | "uid": "xxx" 1303 | }, 1304 | "hostname": "POD", 1305 | "log_directory": "/tmp", 1306 | "linux": { 1307 | "security_context": { 1308 | "privileged": true, 1309 | "namespace_options": { 1310 | "network": 2 1311 | } 1312 | } 1313 | } 1314 | } 1315 | ``` 1316 | 1317 | 然后执行如下命令: 1318 | ```bash 1319 | crictl runp sandbox.json 1320 | ``` 1321 | 1322 | ### 创建业务容器 1323 | 业务容器配置文件`container.json`如下: 1324 | ```json 1325 | { 1326 | "metadata":{ 1327 | "name":"container", 1328 | "attempt": 1 1329 | }, 1330 | "image": { 1331 | "image": "centos:latest" 1332 | }, 1333 | "args": [ 1334 | "sleep", "inf" 1335 | ], 1336 | "mounts": [ 1337 | {"container_path":"/dev", "host_path":"/dev"}, 1338 | {"container_path":"/var/log", "host_path":"/var/log"} 1339 | ], 1340 | "log_path": "tmp.log", 1341 | "linux": { 1342 | "security_context": { 1343 | "privileged": true 1344 | } 1345 | } 1346 | } 1347 | ``` 1348 | 1349 | 然后执行如下命令: 1350 | ```bash 1351 | crictl create container.json sandbox.json 1352 | ``` 1353 | 1354 | ### 如何配置 1355 | 参见`vendor/k8s.io/cri-api/pkg/apis/runtime/v1/api.pb.go`中`PodSandboxConfig`和`ContainerConfig`结构体定义。 1356 | 1357 | **注意**和OCI的区别[opencontainers/runtime-spec](https://github.com/opencontainers/runtime-spec/specs-go/config.go) 。 1358 | 1359 | ## 查看容器资源用量 1360 | 1361 | ### 容器可读可写层存储占用top10 1362 | ```bash 1363 | crictl stats -a -o json | jq '.stats[] | .writableLayer.usedBytes.value + " " + .attributes.labels["io.kubernetes.pod.namespace"] + " " + 
.attributes.labels["io.kubernetes.pod.name"] + " " + .attributes.id' -r | sort -rn | head -n 10 1364 | ``` 1365 | 1366 | ### 容器可读可写层inode占用top10 1367 | ```bash 1368 | crictl stats -a -o json | jq '.stats[] | .writableLayer.inodesUsed.value + " " + .attributes.labels["io.kubernetes.pod.namespace"] + " " + .attributes.labels["io.kubernetes.pod.name"]' -r | sort -rn | head -n 10 1369 | ``` 1370 | 1371 | 1372 | ## 根据进程pid查询pod 1373 | ```bash 1374 | function pid2pod { 1375 | local pid=$1 1376 | if [ -f /proc/${pid}/cgroup ]; then 1377 | local cid=$(cat /proc/${pid}/cgroup | grep ":memory:" | awk -F '/' '{print $NF}' | awk -F ':' '{print $NF}' | sed 's/^cri-containerd-//g' | sed 's/.scope$//g' | grep -v "^crio-") 1378 | if [ "${cid}" = "" ]; then 1379 | # Try cri-o 1380 | cid=$(cat /proc/${pid}/cgroup | grep -m1 "/crio-" | awk -F '/' '{print $NF}' | sed 's/^crio-//g' | sed 's/^conmon-//g' | sed 's/.scope$//g') 1381 | if [ "${cid}" != "" ]; then 1382 | result=$(sudo crictl inspect ${cid} 2>/dev/null | jq -r '.status.labels["io.kubernetes.pod.namespace"]+" "+.status.labels["io.kubernetes.pod.name"]' 2>/dev/null) 1383 | if [ "${result}" != "" ]; then 1384 | echo "${result}" 1385 | else 1386 | sudo crictl inspectp ${cid} 2>/dev/null | jq -r '.status.labels["io.kubernetes.pod.namespace"]+" "+.status.labels["io.kubernetes.pod.name"]' 2>/dev/null 1387 | fi 1388 | fi 1389 | else 1390 | result=$(ctr -n k8s.io c info ${cid} 2>/dev/null | jq -r '.Labels["io.kubernetes.pod.namespace"]+" "+.Labels["io.kubernetes.pod.name"]' 2>/dev/null) 1391 | if [ "${result}" != "" ]; then 1392 | echo "${result}" 1393 | else 1394 | ctr c ls 2>/dev/null | grep ${cid} 2>/dev/null | awk '{print $2}' 2>/dev/null 1395 | fi 1396 | fi 1397 | fi 1398 | } 1399 | 1400 | ``` 1401 | 1402 | ## 根据pod的uid查询pod 1403 | ```bash 1404 | function uid2pod() { 1405 | sudo crictl pods --label io.kubernetes.pod.uid="$1" --output json | jq -r '.items[].metadata | .namespace+"/"+.name' 1406 | } 1407 | ``` 1408 | 1409 | # Docker 1410 | 1411 | ## 容器环境下的swap使用 1412 | 为什么swap不适用于容器平台?我的理解: 1413 | * 有swap在,接近limit时容器内的进程会使用swap“腾出”部分内存,容器limit的限制就得不到遵守,这块同cgroups相关 1414 | * 容器环境下,虽然主机上内存资源充足,但是swap还是会使用,这与swap的设计初衷背道而驰的。 1415 | * 使用swap会严重影响io性能。 1416 | 1417 | 总结,swap是在容器崛起前的产物,当前出现的各类swap问题,归根到底需要swap(内存管理)和cgroup“协商”处理。 1418 | 1419 | 查询占用swap分区Top20的Pods 1420 | ```bash 1421 | #!/bin/bash 1422 | 1423 | for pid in $(top -b -n1 -o SWAP | head -n27 | sed '1,7d' | awk '{print $1}') 1424 | do 1425 | p=${pid} 1426 | while true 1427 | do 1428 | if [ ${p} = 1 -o ${p} = 0 ]; then 1429 | break 1430 | fi 1431 | 1432 | pp=$(ps -o ppid= ${p} | grep -Eo '[0-9]+') 1433 | 1434 | if [ ${pp} = 1 -o ${pp} = 0 ]; then 1435 | break 1436 | fi 1437 | 1438 | search=$(ps -ef | grep "\<${pp}\>" | grep 'docker-containerd-shim') 1439 | if [ "${search}" = "" ]; then 1440 | p=${pp} 1441 | continue 1442 | fi 1443 | 1444 | cid=$(echo ${search} | sed 's/.*docker-containerd-shim//g' | awk '{print $1}') 1445 | cname=$(docker ps --no-trunc | grep ${cid} | awk '{print $NF}') 1446 | if [ "${cname}" = "" ]; then 1447 | break 1448 | fi 1449 | 1450 | OLD_IFS="$IFS" 1451 | IFS="_" 1452 | infos=(${cname}) 1453 | IFS="${OLD_IFS}" 1454 | echo "Pid:$(printf "%6d" ${pid}) $(grep VmSwap /proc/${pid}/status) Pod: ${infos[2]}" 1455 | break 1456 | done 1457 | done 1458 | ``` 1459 | 1460 | ## 深入docker stats命令 1461 | ~~~ 1462 | docker engine-api: func (cli *Client) ContainerStats 1463 | -> dockerd src/github.com/docker/docker/daemon/stats.go:135 daemon.containerd.Stats(c.ID) 1464 | -> containerd 
runtime/container.go func (c *container) Stats() (*Stat, error) 1465 | -> runtime (docker-runc events --stats container-id) runc/libcontainer/cgroups/fs/memory.go func (s *MemoryGroup) GetStats(path string, stats *cgroups.Stats) error 1466 | -> cgroups (memory) 1467 | 1468 | docker-runc events --stats 9c8ad7d4885e2601a76bc3e1a4883a48a1c83e50ab4b7205176055a6fd6ec548 | jq .data.memory 1469 | docker-runc events --stats 9c8ad7d4885e2601a76bc3e1a4883a48a1c83e50ab4b7205176055a6fd6ec548 | jq .data.memory.usage.usage 1470 | 的值直接取自: 1471 | cat /sys/fs/cgroup/memory/kubepods/burstable/podaebd4ae8-8e1b-11e8-b174-3ca82ae95d28/9c8ad7d4885e2601a76bc3e1a4883a48a1c83e50ab4b7205176055a6fd6ec548/memory.usage_in_bytes 1472 | ~~~ 1473 | 1474 | 1475 | ## Docker问题定位 1476 | 1477 | ### Docker卡死hang住 1478 | ```bash 1479 | # 检查dockerd是否响应服务请求 1480 | curl --unix-socket /var/run/docker.sock http://v1.26/containers/json?all=1 1481 | 1482 | # 线程调用栈输出至/var/run/docker文件夹 1483 | kill -SIGUSR1 1484 | 1485 | # containerd调用栈输出至messages,也会输出文件至/tmp目录 1486 | kill -SIGUSR1 1487 | 1488 | # 获取containerd-shim堆栈,堆栈输出至 shim.stdout.log 1489 | # 注意,需要开启containerd-shim -debug 1490 | cat /var/lib/containerd/io.containerd.runtime.v1.linux/moby//shim.stdout.log 1491 | kill -SIGUSR1 1492 | ``` 1493 | 1494 | 1495 | ## Docker操作 1496 | 1497 | ### 常用操作 1498 | 1499 | ```bash 1500 | docker system prune # 存储清理,可以加上参数 -a 1501 | docker system df # 查看容器、镜像的存储用量 1502 | docker 重启是增加 live-restore 选项,可以降低重启docker的开销,重启docker daemon的时候容器不重启 除非bip这些变了。 1503 | docker push xxxx # 将镜像push到私有registry,注意,nodeB希望从nodeA的registry获取镜像时,nodeA上必须先push到registry才行 1504 | docker pull xxxx # 从registry上下载镜像至本地 1505 | docker run -it --name test --net container:1a9bfd40505e --entrypoint=/usr/bin/sh openstack-glance:RC2 # 共享容器网络,glance中携带tcpdump命令,可网络抓包 1506 | docker run -it --name test --net=host openstack-keystone:D1101 bash 1507 | docker rm -f $(docker ps | grep haproxy | awk '{print $1}') 1508 | docker run -it --net=host centos:base bash # 共享HOST网络 1509 | docker stats --no-stream # 查看容器状态、资源使用情况 1510 | docker run -d -p 881 -v /root/sample/website:/var/www/html/website:rw --privileged=true test-img:1.0 nginx # 映射时需要加--privileged=true防止没有权限 1511 | docker attach xxxx # 绑定到容器的stdio 1512 | docker exec d8c875f38278 bash -c "echo '1.2.3.4 hehe' >> /etc/hosts" # 进入容器执行命令 1513 | docker inspect -f "{{json .Mounts}}" b2aed79fec98 1514 | docker inspect ${container} --format '{{.State.Pid}}' # 获取容器的entrypoint进程pid 1515 | docker stats --format "{{.Name}} {{.MemPerc}}" 1516 | docker images --format "{{.Repository}}:{{.Tag}}" 1517 | docker info -f '{{json .}}' | jq # 格式化输出 1518 | docker load --input images.tar.gz 1519 | docker save myimage:latest | gzip > myimage_latest.tar.gz 1520 | curl -v -X POST http://:2375/v1.26/images/load -T xxx.tar # 调用docker接口load容器镜像 1521 | ``` 1522 | 1523 | 1524 | ### 提取镜像rootfs文件 1525 | ```bash 1526 | docker export $(docker create busybox:1.0.0) > busybox.tar 1527 | mkdir rootfs 1528 | tar -C rootfs -xf busybox.tar 1529 | ``` 1530 | 1531 | 1532 | ### docker build构建镜像 1533 | ```bash 1534 | # 常规操作 1535 | docker build -t centos:base -f Dockerfile . 
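# 【补充示例】docker build 其它常用参数(--build-arg 假设 Dockerfile 中声明了 ARG VERSION)
docker build -t centos:base --build-arg VERSION=7.9 -f Dockerfile .
# 【补充示例】不使用构建缓存,并一次性打上多个tag
docker build --no-cache -t centos:base -t centos:7.9 -f Dockerfile .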
1536 | 1537 | # 为容器镜像增加label的简便操作 1538 | echo "FROM centos:7" | docker build --label foo="bar" --label key="value" -t "centos:7-labeled" - 1539 | ``` 1540 | 1541 | 1542 | ### 安装指定版本docker 1543 | 操作如下: 1544 | ```bash 1545 | yum -y install yum-utils 1546 | sudo yum-config-manager --add-repo http://mirrors.aliyun.com/docker-ce/linux/centos/docker-ce.repo 1547 | # 查看可安装docker版本 1548 | yum list docker-ce --showduplicates | sort - 1549 | yum install -y docker-ce-19.03.13-3.el7 1550 | systemctl enable docker.service 1551 | systemctl restart docker 1552 | ``` 1553 | 也支持在既有`Containerd`的节点上,安装Docker。 1554 | 1555 | ### 关闭docker0 1556 | K8s集群网络插件打通容器网络,大多未使用`docker0`,另一方面`docker0`默认占用`172.17.0.1/16`网段,IP地址存在冲突可能,为此考虑关闭`docker0`。 1557 | 注意,要让网络配置修改生效,必须先把容器全部停掉,具体操作如下: 1558 | 1. `systemctl stop kubelet` 让kubelet停掉,不然它又会拉起容器 1559 | 2. `docker stop $(docker ps -q)` 停止所有docker容器 1560 | 3. 修改 `/etc/docker/daemon.json`,在其中增加`"bridge": "none"`将docker0网桥干掉 1561 | 4. `systemctl restart docker` 重启docker服务 1562 | 5. `systemctl start kubelet` 启动kubelet服务 1563 | 1564 | 1565 | ### 修改容器的ulimit默认配置 1566 | 在`/etc/docker/daemon.json`中增加`default-ulimits`,修改容器ulimit默认配置 1567 | ```bash 1568 | # cat /etc/docker/daemon.json 1569 | { 1570 | "default-ulimits": { 1571 | "core": { 1572 | "Name": "core", 1573 | "Hard": 0, 1574 | "Soft": 0 1575 | } 1576 | } 1577 | } 1578 | ``` 1579 | 此后容器内不再输出`coredump`文件,进入容器后确认: 1580 | ```bash 1581 | bash-4.4# cat /proc/$$/limits 1582 | Limit Soft Limit Hard Limit Units 1583 | ... 1584 | Max core file size 0 0 bytes 1585 | ... 1586 | ``` 1587 | 1588 | 1589 | ### 使用docker-storage-setup初始化docker存储 1590 | 节点上安装docker,并使用docker-storage-setup初始化docker存储。 1591 | docker-storage-setup仅依赖配置文件`/etc/sysconfig/docker-storage-setup`,会根据配置文件中的VG自动部署docker storage,包括: 1592 | 1. 创建lv 1593 | 2. 创建docker用的dm thin-pool 1594 | 3. 为docker的thin-pool配置自动扩展(auto extend) 1595 | 4. 为docker生成相应的存储配置(/etc/sysconfig/docker-storage) 1596 | 1597 | docker-storage-setup实则软链接到`/usr/bin/container-storage-setup`。 1598 | `container-storage-setup`由RedHat开发,其目的为"This script sets up the storage for container runtimes"。 1599 | `container-storage-setup`内容可直接阅读脚本。 1600 | 其配置文件路径为`/usr/share/container-storage-setup`,有效内容如下: 1601 | ```bash 1602 | [root@zy-op-m224 ~]# cat /usr/share/container-storage-setup/container-storage-setup | grep -v "^$\|^#" 1603 | STORAGE_DRIVER=devicemapper 1604 | DATA_SIZE=40%FREE 1605 | MIN_DATA_SIZE=2G 1606 | CHUNK_SIZE=512K 1607 | GROWPART=false 1608 | AUTO_EXTEND_POOL=yes 1609 | POOL_AUTOEXTEND_THRESHOLD=60 1610 | POOL_AUTOEXTEND_PERCENT=20 1611 | DEVICE_WAIT_TIMEOUT=60 1612 | WIPE_SIGNATURES=false 1613 | CONTAINER_ROOT_LV_SIZE=40%FREE 1614 | ``` 1615 | 1616 | 1617 | ### 构建Docker镜像最佳实践(Alpine) 1618 | Dockerfile同Makefile类似,借助基础镜像和Dockerfile,能方便的制作出干净、内容可知的容器镜像,同`docker cp + commit`或`docker export`临时方法相比,采用Dockerfile更适合制作正式的、用于发布交付的镜像。 1619 | 1620 | 镜像过大导致: 1621 | 1. 离线安装包过大; 1622 | 2. 过大的安装包和镜像,传输、复制时间过长,系统部署时间显著增加; 1623 | 3. 
过大的镜像,还会消耗过多的容器存储资源。 1624 | 1625 | 针对上述问题,以HAProxy的alpine版镜像为例,根据其官方Dockerfile,介绍如何使用“alpine基础镜像+Dockerfile”方式,制作干净、小巧且够用的Docker镜像,简单归纳如下: 1626 | ```Dockerfile 1627 | # 【可选】 1628 | # 设置环境变量,主要包括软件的版本信息和源码文件MD5校验数据 1629 | ENV VERSION 1.6 1630 | ENV MD5 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx 1631 | 1632 | # 【可选】 1633 | # 安装alpine官方镜像没有,但后期需要使用的工具,以socat为例 1634 | RUN apk add --no-cache socat 1635 | 1636 | # 【可选】 1637 | # 安装构建、编译工具,注意,在最后需要删除这些工具 1638 | RUN apk add --no-cache --virtual .build-deps gcc make binutils 1639 | 1640 | # 【可选】 1641 | # 下载源码、编译、安装,并清除源码和中间文件 1642 | RUN wget -O source-file.tar.gz "http://www.hehe.org/path/to/source-file-${VERSION}.tar.gz" 1643 | RUN echo "$MD5 *source-file.tar.gz" | md5sum -c 1644 | RUN xxx #解压源文件、编译、安装、并删除源文件和中间文件 1645 | 1646 | # 【可选】 1647 | # 删除.build-deps组中所有package 1648 | RUN apk del .build-deps 1649 | 1650 | 1651 | # 设置Docker的ENTRYPOINT和CMD 1652 | ENTRYPOINT ["/docker-entrypoint.sh"] 1653 | CMD ["haproxy", "-f", "/usr/local/etc/haproxy/haproxy.cfg"] 1654 | ``` 1655 | 1656 | 1657 | 1658 | 1659 | ### 强制删除容器 1660 | 1661 | 当`docker rm -f`无法删除容器时,可以找到容器的`docker-container-shim`进程,删除该进程可终结容器,但需关注容器对应的/dev/dm-xx设备。 1662 | 1663 | ### 找到容器使用的dm-xx设备 1664 | 容器的运行时bundle信息在`/var/run/docker/libcontainerd/xxxxxcidxxxxx/config.json`中,使用如下命令 1665 | ```bash 1666 | cat config.json | jq .root.path -r 1667 | /var/lib/docker/devicemapper/mnt/9a7cc2bf60a1b4b9cfc96212b24528c03f7d74b1eabaf640341348e82e61fd15/rootfs 1668 | ``` 1669 | 其中`9a7cc2xxx`就是`devicemapper`设备的id,可通过`dmsetup info`查找到具体的`dm-xx`信息 1670 | 1671 | 1672 | ### docker pull加速 1673 | 1674 | ```bash 1675 | # 在/etc/docker/daemon.json中配置 1676 | { 1677 | "registry-mirrors": ["https://registry.docker-cn.com","https://3laho3y3.mirror.aliyuncs.com"] 1678 | } 1679 | # 然后重启dockerd 1680 | ``` 1681 | 1682 | ### docker使用代理 1683 | 1684 | docker服务设置环境变量以使用代理(也可以直接修改docker.service) 1685 | 1686 | ```bash 1687 | mkdir /etc/systemd/system/docker.service.d 1688 | cat </etc/systemd/system/docker.service.d/http-proxy.conf 1689 | [Service] 1690 | Environment="HTTP_PROXY=http://127.0.0.1:30000/" 1691 | Environment="HTTPS_PROXY=http://127.0.0.1:30000/" 1692 | Environment="NO_PROXY=*.foo.bar,10.0.0.0/8,192.168.*.*" 1693 | EOF 1694 | systemctl daemon-reload 1695 | # 检查环境变量已配置 1696 | systemctl show --property Environment docker 1697 | # 重启docker使配置生效 1698 | systemctl restart docker 1699 | ``` 1700 | 1701 | **注意**,在终端中设置代理时,采用小写,例如: 1702 | ``` 1703 | export https_proxy=http://192.168.58.1:8080/ 1704 | export http_proxy=http://192.168.58.1:8080/ 1705 | # 白名单方式,指定不代理的地址或域名 1706 | export no_proxy=*.local,10.0.0.0/8,192.168.*.* 1707 | ``` 1708 | 1709 | 1710 | 1711 | ### 容器文件系统使用率统计 1712 | 1713 | ```bash 1714 | umount /mnt 2> /dev/null 1715 | for dm in $(ls /dev/mapper/docker-253* | grep -v pool) 1716 | do 1717 | mount ${dm} /mnt 1718 | usage=$(stat -f -c '100-%a*%S/1024*100/10471424' /mnt | bc) 1719 | umount /mnt 1720 | dmid=$(echo ${dm} | sed 's/.*-//g') 1721 | containerid=$(grep -rn ${dmid} /var/run/docker/libcontainerd/*/config.json | sed 's/\/config.json:1.*//g' | sed 's/.*libcontainerd\///g') 1722 | containername=$(docker ps --no-trunc | grep ${containerid} | awk '{print $NF}') 1723 | echo "${dm} $(printf "%3d%%" ${usage}) ${containername}" | grep -v "k8s_POD_" 1724 | done 1725 | ``` 1726 | 1727 | 1728 | ### 强制重启Docker服务 1729 | **未经验证**: 1730 | ```bash 1731 | systemctl stop docker 1732 | killall dockerd 1733 | systemctl start docker 1734 | ``` 1735 | 1736 | 1737 | # 镜像仓库和工具 1738 | ## skopeo 1739 | 常用命令 1740 | ```bash 1741 | skopeo inspect 
docker://foo.bar/image:tag 1742 | skopeo list-tags docker://foo.bar/image 1743 | # 同步镜像的所有tag,当前还不支持多架构 1744 | skopeo sync --src docker --dest dir foo.bar/image /mnt/usb --tls-verify=false --preserve-digests 1745 | 1746 | # 复制镜像 1747 | skopeo copy --dest-tls-verify=false docker://docker.io/image:v1 docker://my.registry.hehe/image:v1 1748 | # 支持所有架构 1749 | skopeo copy docker://foo.bar/image:tag dir:/mnt/usb --tls-verify=false --multi-arch=all --preserve-digests 1750 | ``` 1751 | 1752 | ### 镜像搬运工 1753 | ```bash 1754 | skopeo login registry-1.docker.io -u -p 1755 | skopeo login image.foo.bar -u -p --tls-verify=false 1756 | 1757 | i=centos:latest 1758 | skopeo copy --multi-arch all --dest-tls-verify=false docker://docker.io/$i docker://image.foo.bar/dev/$i --insecure-policy --override-os linux 1759 | ``` 1760 | 1761 | 更复杂的例子: 1762 | ```bash 1763 | skopeo copy --format=v2s2 \ 1764 | --remove-signatures \ 1765 | --authfile=${authfile} \ 1766 | --dest-tls-verify=false \ 1767 | --override-arch=${arch} \ 1768 | --override-os=linux \ 1769 | docker://${src_image} \ 1770 | docker://${dest_image}-${arch} 1771 | ``` 1772 | 1773 | ### Windows环境上源码运行skopeo搬运镜像 1774 | ```bash 1775 | # Windows上构建skopeo可执行文件 1776 | GOOS=windows GOARCH=amd64 go build -tags "containers_image_openpgp" -o bin/skopeo ./cmd/skopeo 1777 | 1778 | # 增加 --override-os 搬运指定系统platform的镜像,例如 linux 1779 | # 增加 --insecure-policy 跳过容器安全策略检查 /etc/containers/policy.json 1780 | skopeo copy --multi-arch all --dest-tls-verify=false docker://docker.io/$i docker://image.foo.bar/dev/$i --insecure-policy --override-os linux 1781 | ``` 1782 | -------------------------------------------------------------------------------- /docs/golang.md: -------------------------------------------------------------------------------- 1 | # TOC 2 | 3 | 4 | * [TOC](#toc) 5 | * [开发](#开发) 6 | * [网络编程](#网络编程) 7 | * [http客户端超时和请求时context超时的区别](#http客户端超时和请求时context超时的区别) 8 | * [常用操作](#常用操作) 9 | * [安装可执行文件](#安装可执行文件) 10 | * [避免1.17的编译注释被自动修改](#避免117的编译注释被自动修改) 11 | * [编译构建](#编译构建) 12 | * [build tag](#build-tag) 13 | * [用法](#用法) 14 | * [与](#与) 15 | * [或](#或) 16 | * [非](#非) 17 | * [传tag](#传tag) 18 | * [buildinfo](#buildinfo) 19 | * [裁剪可执行文件中mod version信息](#裁剪可执行文件中mod-version信息) 20 | * [方法一:关闭mod](#方法一关闭mod) 21 | * [方法二:build时不带mod version](#方法二build时不带mod-version) 22 | * [通过ldflags在编译阶段设置变量值](#通过ldflags在编译阶段设置变量值) 23 | * [示例:使用git describe获取版本](#示例使用git-describe获取版本) 24 | * [示例:使用semver库检查版本](#示例使用semver库检查版本) 25 | * [常用命令](#常用命令) 26 | * [如何Debug Golang程序](#如何debug-golang程序) 27 | * [打印堆栈](#打印堆栈) 28 | * [使用devle调试Go程序](#使用devle调试go程序) 29 | * [使用go tool trace追踪Go程序](#使用go-tool-trace追踪go程序) 30 | * [使用pprof定位Go程序问题](#使用pprof定位go程序问题) 31 | * [程序集成pprof包](#程序集成pprof包) 32 | * [查看goroutine信息](#查看goroutine信息) 33 | * [查看heap信息](#查看heap信息) 34 | * [查看cpu的profile信息](#查看cpu的profile信息) 35 | * [示例:使用pprof定位kube-apiserver问题](#示例使用pprof定位kube-apiserver问题) 36 | * [示例:使用pprof定位kubelet问题](#示例使用pprof定位kubelet问题) 37 | * [golang diagnostics](#golang-diagnostics) 38 | * [识别gc性能问题](#识别gc性能问题) 39 | * [Deep Dive系列](#deep-dive系列) 40 | * [http.Transport中连接池管理](#httptransport中连接池管理) 41 | * [atomic原子操作](#atomic原子操作) 42 | * [使用Value替代Bool](#使用value替代bool) 43 | * [使用Value替代Pointer](#使用value替代pointer) 44 | * [cgo](#cgo) 45 | * [GOARCH跨平台构建](#goarch跨平台构建) 46 | * [go:embed](#go--embed) 47 | * [底层原理](#底层原理) 48 | * [通过goproxy代理解决package下载问题](#通过goproxy代理解决package下载问题) 49 | * [示例](#示例) 50 | * [启HTTP服务](#启http服务) 51 | * [测试cpu性能](#测试cpu性能) 52 | * [代码实例](#代码实例) 53 | * [自定义排序](#自定义排序) 54 | * 
[在多个地址/端口上监听](#在多个地址端口上监听) 55 | * [为mutex锁设置超时](#为mutex锁设置超时) 56 | 57 | 58 | # 开发 59 | ## 网络编程 60 | ### http客户端超时和请求时context超时的区别 61 | TODO: [Go http client timeout vs context timeout](https://stackoverflow.com/questions/64129364/go-http-client-timeout-vs-context-timeout) 62 | 63 | 64 | # 常用操作 65 | 66 | ## 安装可执行文件 67 | 以安装*controller-gen*为例: 68 | ```bash 69 | mkdir /tmp/bin 70 | GOBIN=/tmp/bin go install sigs.k8s.io/controller-tools/cmd/controller-gen@v0.11.4 71 | ``` 72 | 其中指定了版本v0.11.4。 73 | 74 | ## 避免1.17的编译注释被自动修改 75 | ```bash 76 | for s in $(git status | grep modified | awk '{print $2}'); do 77 | c=$(head -n1 $s | grep "^//go:build" -c) 78 | if [ $c -eq 1 ]; then 79 | sed -i '1d' $s 80 | fi 81 | done 82 | ``` 83 | 84 | ```bash 85 | # 编译静态链接的可执行文件 86 | CGO_ENABLED=0 go build -o harbor_ui github.com/vmware/harbor/src/ui 87 | 88 | # 使用vendor 89 | go build -mod vendor ./pkg/agent 90 | ``` 91 | 92 | # 编译构建 93 | ## build tag 94 | ### 用法 95 | 在待控制的源文件头加: 96 | ``` 97 | // +build tag_name 98 | ``` 99 | 编译时需指定如下`tag`,才将源文件编进去,具体操作如下: 100 | ``` 101 | go build -tags tag_name 102 | ``` 103 | 详见 [customizing-go-binaries-with-build-tags](https://www.digitalocean.com/community/tutorials/customizing-go-binaries-with-build-tags) 104 | 105 | ### 与 106 | ``` 107 | // +build linux,cgo 108 | ``` 109 | 110 | 或者 111 | 112 | ``` 113 | // +build linux 114 | // +build cgo 115 | ``` 116 | 117 | ### 或 118 | ``` 119 | // +build linux darwin 120 | ``` 121 | 122 | ### 非 123 | ``` 124 | // +build !linux 125 | ``` 126 | 127 | ### 传tag 128 | ``` 129 | go build -tags=linux 130 | go build -tags="linux cgo" 131 | go build -tags=linux,cgo 132 | ``` 133 | 134 | **注意**,如下方式会导致前面的tag被覆盖,不能这样使用: 135 | ``` 136 | go build -tags=linux -tags=cgo 137 | ``` 138 | 139 | ## buildinfo 140 | - fix: [cmd/go: do not include module info when -buildinfo=false](https://go-review.googlesource.com/c/go/+/376674) 141 | - issue: [cmd/go: no way to omit package information from binary](https://github.com/golang/go/issues/50501) 142 | 143 | ## 裁剪可执行文件中mod version信息 144 | ### 方法一:关闭mod 145 | ```bash 146 | # go build时关闭GO111MODULE 147 | GO111MODULE="off" go build xxx 148 | # 或者 149 | GO111MODULE="off" make xxx 150 | ``` 151 | 152 | ### 方法二:build时不带mod version 153 | 参见 [go/issues/50501](https://github.com/golang/go/issues/50501) ,已不支持去除mod信息。 154 | 155 | ## 通过ldflags在编译阶段设置变量值 156 | 例如在 *pkg/version/version.go* 中定义有变量: 157 | ```golang 158 | var BuildVersion = "xxxx" 159 | ``` 160 | 161 | 编译时,通过`-ldflags`设置`BuildVersion`变量值: 162 | ```bash 163 | GLDFLAGS="-X ${REPO}/pkg/version.Raw=${VERSION_OVERRIDE} -X ${REPO}/pkg/version.BuildVersion=${BUILD_VERSION}" 164 | go build -ldflags "${GLDFLAGS}" ... 165 | ``` 166 | 167 | ### 示例:使用git describe获取版本 168 | > VERSION_OVERRIDE=$(git describe --abbrev=8 --dirty --always) 169 | 170 | ### 示例:使用semver库检查版本 171 | ```golang 172 | import ( 173 | "strings" 174 | "github.com/blang/semver/v4" 175 | ) 176 | 177 | var ( 178 | Raw = "v0.0.1" 179 | 180 | // Version is the semver representation of the version. 
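	// 补充说明:blang/semver 的 MustParse 在解析失败时会 panic,
	// 因此非法的 Raw 版本号会在程序启动(包初始化)阶段尽早暴露。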
181 | Version = semver.MustParse(strings.TrimLeft(Raw, "v")) 182 | ) 183 | ``` 184 | 185 | ## 常用命令 186 | ```bash 187 | # 查看exec-binary-file中使用的mod的version 188 | go version -m exec-binary-file 189 | 190 | # 老旧代码,未使用mod的编译方式,以 k8s-dummy-device-plugin 为例 191 | # 代码放 /usr/src 目录下,进入 /usr/src/k8s-dummy-device-plugin 目录 192 | GOPATH=/usr GO111MODULE=off CGO_ENABLED=0 go build -a -o k8s-dummy-device-plugin dummy.go 193 | 194 | ``` 195 | 196 | # 如何Debug Golang程序 197 | 198 | ## 打印堆栈 199 | 在最佳实践中,Golang程序会监听signal,一旦接收的对应的信号就打印堆栈信息,用于debug。 200 | 如下示例摘取自`docker/containerd`: 201 | ```go 202 | import ( 203 | "runtime" 204 | ) 205 | 206 | // DumpStacks dumps the runtime stack. 207 | func dumpStacks() { 208 | var ( 209 | buf []byte 210 | stackSize int 211 | ) 212 | bufferLen := 16384 213 | for stackSize == len(buf) { 214 | buf = make([]byte, bufferLen) 215 | stackSize = runtime.Stack(buf, true) 216 | bufferLen *= 2 217 | } 218 | buf = buf[:stackSize] 219 | logrus.Infof("=== BEGIN goroutine stack dump ===\n%s\n=== END goroutine stack dump ===", buf) 220 | } 221 | 222 | func setupDumpStacksTrap() { 223 | c := make(chan os.Signal, 1) 224 | signal.Notify(c, syscall.SIGUSR1) 225 | go func() { 226 | for range c { 227 | dumpStacks() 228 | } 229 | }() 230 | } 231 | 232 | func main() { 233 | ... 234 | setupDumpStacksTrap() 235 | ... 236 | } 237 | ``` 238 | 239 | ## 使用devle调试Go程序 240 | 参见 [项目地址](https://github.com/go-delve/delve)。 241 | 242 | 243 | ## 使用go tool trace追踪Go程序 244 | 使用`go tool trace`能有效追踪程序执行性能问题、死锁等问题。 245 | 246 | TODO 247 | 248 | 参考资料: 249 | - [Golang 大杀器之跟踪剖析 trace](https://segmentfault.com/a/1190000019736288) 250 | 251 | 252 | ## 使用pprof定位Go程序问题 253 | 参考资料: 254 | - https://segmentfault.com/a/1190000039649589 255 | - https://www.kubernetes.org.cn/3119.html 256 | - https://pkg.go.dev/net/http/pprof 257 | - https://lightcone.medium.com/how-to-profile-go-programs-c6c00e8f2ebf 258 | - TODO https://www.huaweicloud.com/articles/760089e5e8665e2397024ce2b9c39871.html 259 | - TODO https://go.dev/blog/pprof 260 | - TODO https://github.com/rsc/benchgraffiti 261 | 262 | ### 程序集成pprof包 263 | ```golang 264 | package main 265 | 266 | import ( 267 | "fmt" 268 | "log" 269 | "net/http" 270 | _ "net/http/pprof" 271 | ) 272 | 273 | func main() { 274 | fmt.Printf("start\n") 275 | log.Println(http.ListenAndServe("localhost:6060", nil)) 276 | fmt.Printf("finished\n") 277 | } 278 | ``` 279 | 280 | 编译并运行上述程序 281 | ```bash 282 | go build -o demo demo.go && ./demo & 283 | ``` 284 | 285 | #### 查看goroutine信息 286 | ```bash 287 | go tool pprof http://localhost:6060/debug/pprof/goroutine 288 | (pprof) top # 查看“执行数量”前top的goroutine 289 | (pprof) traces # 查看goroutine调用栈 290 | (pprof) list # 查看函数源码 291 | ``` 292 | 293 | 或者直接下载`goroutine`堆栈文件: 294 | ```bash 295 | curl http://localhost:6060/debug/pprof/goroutine?debug=2 >> goroutine.txt 296 | # 或者 297 | curl http://localhost:6060/debug/pprof/goroutine?debug=1 >> goroutine.txt 298 | ``` 299 | 300 | #### 查看heap信息 301 | ```bash 302 | go tool pprof http://localhost:6060/debug/pprof/heap 303 | (pprof) top 304 | ``` 305 | 306 | #### 查看cpu的profile信息 307 | ```bash 308 | go tool pprof http://localhost:6060/debug/pprof/profile?seconds=30 309 | (pprof) top 310 | ``` 311 | 312 | ### 示例:使用pprof定位kube-apiserver问题 313 | kube-apiserver集成了pprof工具,可以通过/debug/prof/*获得kube-apiserver的heap、profile等信息: 314 | ```bash 315 | # 首先开启代理,会监听 127.0.0.1:8001 316 | kubectl proxy 317 | # 已采集的性能数据,可以启web server访问 318 | go tool pprof -http=127.0.0.1:8088 /path/to/pprof.kube-apiserver.goroutine.001.pb.gz 319 | # 
或者浏览器交互式访问远端pprof服务 320 | go tool pprof -http=127.0.0.1:8088 http://1.2.3.4:12345/debug/pprof/heap 321 | # 也可以交互式访问 322 | go tool pprof /path/to/pprof.kube-apiserver.goroutine.001.pb.gz 323 | 324 | # 当通过web可视化访问时,可能提示“Failed to execute dot. Is Graphviz installed?”,需要安装graphviz 325 | # 命令如下,参见链接 https://graphviz.org/download/ 326 | yum install graphviz 327 | # windows从 https://graphviz.org/download/#windows 下载并安装 328 | # 然后设置PATH,将graphviz的bin(默认C:\Program Files\Graphviz\bin)添加到PATH环境变量中 329 | 330 | # 内存heap信息 331 | go tool pprof http://127.0.0.1:8001/debug/pprof/heap 332 | # 进入交互界面后,输入top 20查看内存使用前20的函数调用 333 | top 20 334 | 335 | # goroutine堆栈信息 336 | go tool pprof http://127.0.0.1:8001/debug/pprof/goroutine 337 | # 进入交互界面,查看“执行数量”前top的goroutine 338 | top 339 | # 查看goroutine调用栈 340 | traces 341 | # 查看代码详情 342 | list 343 | # 获取 goroutine pprof 文件后,直接打开 344 | TODO 345 | 346 | # 获取profile文件: 347 | go tool pprof http://127.0.0.1:8001/debug/pprof/profile 348 | # 查看30s的CPU Profile 349 | go tool pprof http://127.0.0.1:8001/debug/pprof/profile?seconds=30 350 | 351 | # 当程序里调用 runtime.SetBlockProfileRate 后,查看 goroutine blocking profile 352 | go tool pprof http://127.0.0.1:8001/debug/pprof/block 353 | 354 | # 当程序里调用 runtime.SetMutexProfileFraction 后,查看 contended mutexes 锁的持有者 355 | go tool pprof http://127.0.0.1:8001/debug/pprof/mutex 356 | 357 | # 获取并分析5秒的Trace追踪信息 358 | wget -O trace.out http://127.0.0.1:8001/debug/pprof/trace?seconds=5 359 | go tool trace trace.out 360 | 361 | # 查阅所有profile信息,浏览器打开如下链接: 362 | # http://127.0.0.1:8001/debug/pprof/ 363 | ``` 364 | 365 | 366 | ### 示例:使用pprof定位kubelet问题 367 | ```bash 368 | # master节点上,开启debug代理 369 | kubectl proxy 370 | 371 | node=<问题节点> 372 | 373 | # 【可选】动态调整kubelet日志级别 374 | curl -X PUT http://127.0.0.1:8001/api/v1/nodes/${node}/proxy/debug/flags/v -d "4" 375 | 376 | # 收集pprof 377 | wget -O ${node}-profile-$(date +"%y%m%d%H%M") http://127.0.0.1:8001/api/v1/nodes/${node}/proxy/debug/pprof/profile 378 | wget -O ${node}-heap-$(date +"%y%m%d%H%M") http://127.0.0.1:8001/api/v1/nodes/${node}/proxy/debug/pprof/heap 379 | curl http://127.0.0.1:8001/api/v1/nodes/${node}/proxy/debug/pprof/goroutine?debug=1 >> ${node}-goroutine-debug1-$(date +"%y%m%d%H%M") 380 | curl http://127.0.0.1:8001/api/v1/nodes/${node}/proxy/debug/pprof/goroutine?debug=2 >> ${node}-goroutine-debug2-$(date +"%y%m%d%H%M") 381 | 382 | # 打开pprof 383 | go tool pprof -http :8080 ./ 384 | go tool pprof -http :8080 ./ 385 | ``` 386 | 387 | ## golang diagnostics 388 | TODO: https://golang.org/doc/diagnostics 389 | 390 | ## 识别gc性能问题 391 | ```bash 392 | # GC耗时指标 393 | kubectl get --raw /api/v1/nodes/single/proxy/metrics | grep go_gc_pauses_seconds_bucket 394 | ``` 395 | 396 | # Deep Dive系列 397 | ## http.Transport中连接池管理 398 | `http.Transport`的`getConn()`方法,从连接池中获取空闲连接,或新建一个连接。 399 | 400 | ## atomic原子操作 401 | TODO: 402 | - [Go 语言标准库中 atomic.Value 的前世今生](https://blog.betacat.io/post/golang-atomic-value-exploration/) 403 | 404 | ### 使用Value替代Bool 405 | ```golang 406 | ... 407 | showHidden atomic.Value 408 | ... 409 | func ShouldShowHidden() bool { 410 | return showHidden.Load() != nil && showHidden.Load().(bool) 411 | } 412 | ``` 413 | 414 | ### 使用Value替代Pointer 415 | ```golang 416 | ... 417 | cache atomic.Value 418 | ... 
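// 补充注释:atomic.Value 要求每次 Store 的具体类型一致,
// 下面示意代码混存 *bool(重置哨兵)与 *cachedGroupList,第二次 Store 不同类型会 panic;
// 真实实现中应统一为同一种类型(例如在 cachedGroupList 中增加重置标记字段)。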
419 | 420 | var resettedHint bool = true 421 | 422 | // Reset 423 | cache.Store(&resettedHint) 424 | 425 | // Main process logic 426 | cacheLoad, ok := cache.Load().(*cachedGroupList) 427 | if ok { 428 | return cacheLoad 429 | } else { 430 | cached := &cachedGroupList{ 431 | cachedResponse: response, 432 | cachedResponseETag: etag, 433 | } 434 | cache.Store(cached) 435 | return cached 436 | } 437 | ``` 438 | 439 | ## cgo 440 | TODO: https://chai2010.cn/advanced-go-programming-book/ch2-cgo/ch2-02-basic.html 441 | 442 | ## GOARCH跨平台构建 443 | ```bash 444 | # 查看golang支持的OS和ARCH组合 445 | go tool dist list 446 | 447 | # 在x86-64 windows环境编译linux arm64架构的可执行文件 448 | CGO_ENABLED=0 LD_FLAGS=-s GOARCH=arm64 GOOS=linux go build -C pkg/recommender -mod vendor -o recommender-arm64 449 | ``` 450 | 451 | ## go:embed 452 | TODO 453 | ```golang 454 | //go:embed manifests/guard.yaml 455 | var podTemplate []byte 456 | ``` 457 | 458 | ## 底层原理 459 | * TODO: [「Golang进阶之路」底层原理篇]https://learnku.com/articles/89660 460 | 461 | # 通过goproxy代理解决package下载问题 462 | ```bash 463 | go env -w GO111MODULE=on 464 | go env -w GOPROXY=https://goproxy.cn,direct 465 | 466 | # 设置不走 proxy 的私有仓库,多个用逗号相隔(可选) 467 | go env -w GOPRIVATE=*.corp.example.com 468 | 469 | # 设置不走 proxy 的私有组织(可选) 470 | go env -w GOPRIVATE=example.com/org_name 471 | ``` 472 | 参见[goproxy官网](https://goproxy.io/zh/) 473 | 474 | 475 | # 示例 476 | 477 | ## 启HTTP服务 478 | `http.go`文件内容如下: 479 | ```golang 480 | package main 481 | 482 | import ( 483 | "net/http" 484 | ) 485 | 486 | func main() { 487 | http.Handle("/", http.FileServer(http.Dir("./"))) 488 | http.ListenAndServe(":34567", nil) 489 | } 490 | ``` 491 | 执行命令`go run http.go`启动服务。 492 | 493 | ## 测试cpu性能 494 | ```golang 495 | package main 496 | 497 | import "math/rand" 498 | 499 | func main() { 500 | var c int 501 | for i := 0; i < 1024 * 1024 * 1024; i++ { 502 | c = rand.Int() * rand.Int() 503 | c = c * rand.Int() 504 | } 505 | } 506 | ``` 507 | 508 | ## 代码实例 509 | 510 | ### 自定义排序 511 | ```golang 512 | // 自定义排序方式 513 | sort.Sort(byCreationTimestamp(terminatedPods)) 514 | ... 515 | // byCreationTimestamp sorts a list by creation timestamp, using their names as a tie breaker. 
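// 补充注释:byCreationTimestamp 实现 sort.Interface 的 Len/Swap/Less 三个方法后,即可交给 sort.Sort 使用。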
516 | type byCreationTimestamp []*v1.Pod 517 | 518 | func (o byCreationTimestamp) Len() int { return len(o) } 519 | func (o byCreationTimestamp) Swap(i, j int) { o[i], o[j] = o[j], o[i] } 520 | 521 | func (o byCreationTimestamp) Less(i, j int) bool { 522 | if o[i].CreationTimestamp.Equal(&o[j].CreationTimestamp) { 523 | return o[i].Name < o[j].Name 524 | } 525 | return o[i].CreationTimestamp.Before(&o[j].CreationTimestamp) 526 | } 527 | ``` 528 | 529 | ### 在多个地址/端口上监听 530 | ```golang 531 | package main 532 | 533 | import ( 534 | "net/http" 535 | ) 536 | 537 | func index(rw http.ResponseWriter, req *http.Request) { 538 | rw.Write([]byte("hello world")) 539 | } 540 | 541 | func main() { 542 | http.HandleFunc("/", index) 543 | go http.ListenAndServe("127.0.0.1:1555", nil) 544 | go http.ListenAndServe("127.0.0.1:1666", nil) 545 | select {} 546 | } 547 | ``` 548 | 549 | ### 为mutex锁设置超时 550 | ```golang 551 | package main 552 | 553 | import ( 554 | "context" 555 | "fmt" 556 | "sync" 557 | "time" 558 | ) 559 | 560 | func main() { 561 | var ml sync.Mutex 562 | go func() { 563 | ml.Lock() 564 | fmt.Printf("other one is holding the lock...\n") 565 | }() 566 | 567 | ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) 568 | defer cancel() 569 | 570 | var ch chan struct{} 571 | 572 | go func() { 573 | ml.Lock() 574 | ch <- struct{}{} 575 | }() 576 | 577 | select { 578 | case <-ctx.Done(): 579 | fmt.Printf("lock failed: timeout\n") 580 | case <-ch: 581 | fmt.Printf("lock success\n") 582 | } 583 | 584 | return 585 | } 586 | ``` 587 | -------------------------------------------------------------------------------- /docs/kubernetes.md: -------------------------------------------------------------------------------- 1 | # TOC 2 | 3 | 4 | * [TOC](#toc) 5 | * [集群控制面高可用方案](#集群控制面高可用方案) 6 | * [多实例leader选举](#多实例leader选举) 7 | * [Pod健康和就绪检查遇到的坑](#pod健康和就绪检查遇到的坑) 8 | * [问题描述](#问题描述) 9 | * [结论](#结论) 10 | * [分析](#分析) 11 | * [其它](#其它) 12 | * [Kubernetes高级调度特性](#kubernetes高级调度特性) 13 | * [亲和性](#亲和性) 14 | * [配置示例](#配置示例) 15 | * [自定义调度器](#自定义调度器) 16 | * [API优先级APIPriorityAndFairness](#api优先级apipriorityandfairness) 17 | * [以CRD方式扩展API](#以crd方式扩展api) 18 | * [Pod调度如何感知volume的topology](#pod调度如何感知volume的topology) 19 | * [CPU资源高级管理](#cpu资源高级管理) 20 | * [kube-proxy集群内负载均衡](#kube-proxy集群内负载均衡) 21 | * [深入iptables模式的kube-proxy](#深入iptables模式的kube-proxy) 22 | * [实现会话亲和性](#实现会话亲和性) 23 | * [域名解析和DNS策略](#域名解析和dns策略) 24 | * [Pod's DNS Policy](#pods-dns-policy) 25 | * [对象名称和字符串格式检查](#对象名称和字符串格式检查) 26 | * [kubectl插件](#kubectl插件) 27 | * [认证Authentication](#认证authentication) 28 | * [账号](#账号) 29 | * [Kubernetes用户](#kubernetes用户) 30 | * [服务账号Service Account](#服务账号service-account) 31 | * [证书用户User](#证书用户user) 32 | * [如何创建一个证书用户](#如何创建一个证书用户) 33 | * [通过webhook对接外部认证提供商](#通过webhook对接外部认证提供商) 34 | * [到达聚合apiserver的请求中如何携带用户信息](#到达聚合apiserver的请求中如何携带用户信息) 35 | * [TODO](#todo) 36 | * [鉴权Authorization](#鉴权authorization) 37 | * [判断我是否有权限](#判断我是否有权限) 38 | * [判断谁有权限操作](#判断谁有权限操作) 39 | * [常见操作](#常见操作) 40 | * [安全](#安全) 41 | * [Pod Security Admission](#pod-security-admission) 42 | * [配置container Capabilities](#配置container-capabilities) 43 | * [Kubernetes对接容器安全](#kubernetes对接容器安全) 44 | * [CRI接口中LinuxContainerSecurityContext](#cri接口中linuxcontainersecuritycontext) 45 | * [OCI接口中LinuxDeviceCgroup](#oci接口中linuxdevicecgroup) 46 | * [操作实例](#操作实例) 47 | * [大规模集群实践](#大规模集群实践) 48 | * [社区优化跟踪](#社区优化跟踪) 49 | * [在大规模集群中优雅的操作](#在大规模集群中优雅的操作) 50 | * [集群Pod总数](#集群pod总数) 51 | * [集群Event总数](#集群event总数) 52 | * [筛选慢操作list all](#筛选慢操作list-all) 53 | * 
[筛选出最早创建的一组pod(用于onDelete策略的更新)](#筛选出最早创建的一组pod用于ondelete策略的更新) 54 | * [节点维护](#节点维护) 55 | * [便捷操作](#便捷操作) 56 | * [event使用独立的etcd集群](#event使用独立的etcd集群) 57 | * [模拟list对kube-apiserver进行压测](#模拟list对kube-apiserver进行压测) 58 | * [获取openapi json](#获取openapi-json) 59 | * [从secret中获取证书信息](#从secret中获取证书信息) 60 | * [从KubeConfig文件中提取证书秘钥](#从kubeconfig文件中提取证书秘钥) 61 | * [堆栈文件分析](#堆栈文件分析) 62 | * [根据sa生成kubeconfig](#根据sa生成kubeconfig) 63 | * [kubeconfig跳过服务端证书校验](#kubeconfig跳过服务端证书校验) 64 | * [定制kubectl输出](#定制kubectl输出) 65 | * [kubectl patch操作](#kubectl-patch操作) 66 | * [常见操作](#常见操作-1) 67 | * [资源遍历](#资源遍历) 68 | * [遍历列出所有的资源类型及支持的操作](#遍历列出所有的资源类型及支持的操作) 69 | * [遍历所有pod](#遍历所有pod) 70 | * [遍历所有pod及其容器](#遍历所有pod及其容器) 71 | * [遍历所有工作负载](#遍历所有工作负载) 72 | * [遍历一个命名空间下所有资源](#遍历一个命名空间下所有资源) 73 | * [遍历一个命名空间下所有资源的label和annotations](#遍历一个命名空间下所有资源的label和annotations) 74 | * [遍历所有区分命名空间的资源的内容](#遍历所有区分命名空间的资源的内容) 75 | * [遍历所有跨命名空间的资源](#遍历所有跨命名空间的资源) 76 | * [遍历所有跨命名空间的资源的label和annotations](#遍历所有跨命名空间的资源的label和annotations) 77 | * [遍历所有跨命名空间的资源的内容](#遍历所有跨命名空间的资源的内容) 78 | * [遍历所有pod的cpu request配置](#遍历所有pod的cpu-request配置) 79 | * [客户端访问集群时context配置](#客户端访问集群时context配置) 80 | * [ConfigMap使用](#configmap使用) 81 | * [日志相关配置](#日志相关配置) 82 | * [提升集群HA性能](#提升集群ha性能) 83 | * [强制删除Pod](#强制删除pod) 84 | * [Pod中获取PodIP的方法](#pod中获取podip的方法) 85 | * [emptyDir在宿主机上的路径](#emptydir在宿主机上的路径) 86 | * [节点上emptyDir用量统计](#节点上emptydir用量统计) 87 | * [远程到节点统计emptyDir用量](#远程到节点统计emptydir用量) 88 | * [FC存储多路径的PV配置](#fc存储多路径的pv配置) 89 | * [编译kubelet](#编译kubelet) 90 | * [获取k8s控制面组件指标](#获取k8s控制面组件指标) 91 | * [kubeadm部署的集群的操作](#kubeadm部署的集群的操作) 92 | * [kube-apiserver内部本地访问客户端](#kube-apiserver内部本地访问客户端) 93 | * [读取 kubelet_internal_checkpoint](#读取-kubeletinternalcheckpoint) 94 | * [最佳实践](#最佳实践) 95 | * [使用finalizers拦截资源删除](#使用finalizers拦截资源删除) 96 | * [手动清理finalizers](#手动清理finalizers) 97 | * [资源限制](#资源限制) 98 | * [容器进程数限制pids](#容器进程数限制pids) 99 | * [HPA](#hpa) 100 | * [集群内通过svc访问外部服务](#集群内通过svc访问外部服务) 101 | * [性能调优](#性能调优) 102 | * [读懂监控指标](#读懂监控指标) 103 | * [etcd监控指标](#etcd监控指标) 104 | * [kube-apiserver监控指标](#kube-apiserver监控指标) 105 | * [kube-controller-manager监控指标](#kube-controller-manager监控指标) 106 | * [kube-scheduler监控指标](#kube-scheduler监控指标) 107 | * [kubelet监控指标](#kubelet监控指标) 108 | * [内存优化](#内存优化) 109 | * [查看defaultCpuSet核上CPU使用量](#查看defaultcpuset核上cpu使用量) 110 | * [Deep Dive系列](#deep-dive系列) 111 | * [kube-apiserver](#kube-apiserver) 112 | * [服务启动流程](#服务启动流程) 113 | * [服务端fieldSelector](#服务端fieldselector) 114 | * [REST Storage](#rest-storage) 115 | * [安装API及其REST Storage](#安装api及其rest-storage) 116 | * [API定义和版本](#api定义和版本) 117 | * [序列化和反序列化](#序列化和反序列化) 118 | * [TypeMeta的反序列化](#typemeta的反序列化) 119 | * [外部版本的序列化和反序列化](#外部版本的序列化和反序列化) 120 | * [codec和codec factory](#codec和codec-factory) 121 | * [资源schema](#资源schema) 122 | * [健康检查/healthz](#健康检查healthz) 123 | * [就绪检查/readyz](#就绪检查readyz) 124 | * [node authorizer实现](#node-authorizer实现) 125 | * [kube-controller-manager](#kube-controller-manager) 126 | * [配置和初始化](#配置和初始化) 127 | * [leader选举](#leader选举) 128 | * [核心Controller](#核心controller) 129 | * [kube-scheduler](#kube-scheduler) 130 | * [配置和初始化](#配置和初始化-1) 131 | * [leader选举](#leader选举-1) 132 | * [资源调度](#资源调度) 133 | * [kubelet](#kubelet) 134 | * [配置和初始化](#配置和初始化-2) 135 | * [PLEG](#pleg) 136 | * [调用CRI接口](#调用cri接口) 137 | * [(间接)通过CNI接口管理网络](#间接通过cni接口管理网络) 138 | * [通过CSI管理存储](#通过csi管理存储) 139 | * [设备和资源管理](#设备和资源管理) 140 | * [资源计算和预留](#资源计算和预留) 141 | * [为容器进程设置oom_score_adj](#为容器进程设置oomscoreadj) 142 | * [Topology Manager](#topology-manager) 143 | * [CPU Manager](#cpu-manager) 144 | * 
[遍历所有Pod的cpuset配置](#遍历所有pod的cpuset配置) 145 | * [Memory Manager](#memory-manager) 146 | * [Device Manager](#device-manager) 147 | * [节点优雅关机 GracefulNodeShutdown](#节点优雅关机-gracefulnodeshutdown) 148 | * [库函数和实操](#库函数和实操) 149 | * [特性门featuregate](#特性门featuregate) 150 | * [处理runtime.Object](#处理runtimeobject) 151 | * [获取meta.Object信息](#获取metaobject信息) 152 | * [Debug](#debug) 153 | * [kube-apiserver](#kube-apiserver-1) 154 | * [kubelet](#kubelet-1) 155 | * [kube-controller-manager](#kube-controller-manager-1) 156 | * [kube-scheduler](#kube-scheduler-1) 157 | * [备忘](#备忘) 158 | * [k8s版本信息](#k8s版本信息) 159 | * [从源码编译kubernetes时版本信息](#从源码编译kubernetes时版本信息) 160 | * [修改结构体定义后更新api-rules校验](#修改结构体定义后更新api-rules校验) 161 | * [构建时如何选取version](#构建时如何选取version) 162 | * [StatefulSet无法更新中volumeClaimTemplates的request](#statefulset无法更新中volumeclaimtemplates的request) 163 | * [其它](#其它-1) 164 | 165 | 166 | # 集群控制面高可用方案 167 | TODO 168 | kubernetes的组件,例如apiserver、controller、scheduler、kube-dns在配置时,均能指定多个server,使用failover方式保证高可用。 169 | 以apiserver为例,帮助信息中有: 170 | ```bash 171 | --etcd-servers=[]: List of etcd servers to connect with (http://ip:port), comma separated. 172 | ``` 173 | 通过--etcd-servers指定多个etcd服务器,apiserver能failover方式访问这些服务。 174 | 175 | # 多实例leader选举 176 | 客户端代码路径: 177 | k8s.io/kubernetes/pkg/client/leaderelection/leaderelection.go 178 | 179 | 180 | # Pod健康和就绪检查遇到的坑 181 | 182 | ## 问题描述 183 | Pod进行健康和就绪检查配置中,发现某些已有健康检查的服务,在增加就绪检查后Pod一直不就绪,且健康检查也出问题。如下健康检查为例 184 | ```bash 185 | livenessProbe: 186 | httpGet: 187 | host: 127.0.0.1 188 | path: / 189 | port: 9311 190 | initialDelaySeconds: 600 191 | periodSeconds: 60 192 | timeoutSeconds: 30 193 | ``` 194 | Pod正常工作。再增加就绪检查: 195 | ```bash 196 | readinessProbe: 197 | httpGet: 198 | host: 127.0.0.1 199 | path: / 200 | port: 9311 201 | initialDelaySeconds: 5 202 | periodSeconds: 30 203 | timeoutSeconds: 25 204 | ``` 205 | 以后,Pod一直未能就绪,且因健康检查失败而反复重启。 206 | 207 | ## 结论 208 | 209 | **检查方法httpGet在容器外执行,强烈建议不要指定host(除非知晓其中的风险)** 210 | httpGet检查在容器外执行,但其行为表现严重受到host影响: 211 | - 指定有host时,httpGet访问该host上的相应端口,若host指定为127.0.0.1,则访问节点本地的服务端口,外在表现为“容器外执行” 212 | - 未指定host时,httpGet默认访问该Pod(Pod IP)上相应端口,在容器网络(例如flannel、kube-proxy)中该请求直接转发到容器中,外在表现是访问容器内部端口、在“容器内执行”。 213 | 214 | **检查方法tcpSocket在容器外执行,但不支持指定host,请求直接转发到容器中** 215 | tcpSocket检查无法指定host,直接同该Pod(Pod IP)上相应端口建立连接,该连接直接转发到容器中,因此外在表现是访问容器内部端口、在“容器内执行”。 216 | 217 | **检查方法exec在容器内执行** 218 | exec检查指定的操作在容器内执行。 219 | 220 | 221 | ## 分析 222 | 223 | 参见代码`kubernetes/kubernetes/pkg/kubelet/prober/prober.go`。 224 | 225 | 就着结论,我们来分析为什么会出现上述问题中的表现。 226 | 227 | 仅配置健康检查时,指定host为127.0.0.1,其实访问节点本地的9311端口。目前,大多数服务将容器内部端口通过nodePort方式暴露到节点上,且nodePort端口同容器内部端口保持一致,健康检查能通过如下流程顺利执行httpGet操作 228 | > kubelet的Probe模块(容器外)发起HTTP请求 -> kube-proxy的nodePort -> 容器内targetPort ->容器内服务。 229 | 230 | 当加入就绪检查后情况发生变化。就绪检查中指定host为127.0.0.1,由于Pod还未就绪、Service没有可用的Endpoint,访问节点本地9311端口时失败,pod则一直不就绪。相应的,其健康检查也无法访问节点本地9311端口,导致健康检查失败、Pod反复重启。 231 | 232 | 解决办法在于去掉健康和就绪检查中的host配置,使httpGet请求发送到Pod内,不再依赖节点上nodePort暴露的服务。 233 | 234 | 235 | ## 其它 236 | 某些服务配置了host过滤,仅支持访问指定host,在健康和就绪检查的httpGet中增加如下配置即可: 237 | ```bash 238 | httpGet: 239 | httpHeaders: 240 | - name: Host 241 | value: ${ALLOWED_HOST} 242 | path: / 243 | port: 9311 244 | scheme: HTTP 245 | ``` 246 | 健康和就绪检查中增加HTTP扩展头部`Host: ${ALLOWED_HOST}`,其中`${ALLOWED_HOST}`是服务中配置的host过滤中允许访问的host。 247 | 248 | 249 | # Kubernetes高级调度特性 250 | 为Pending状态的Pod选取一个 **合适** 的Node去运行,是Kubernetes调度的唯一目的。该目的简单、明确,但最重要也是最麻烦的在于 **“合适”** 两字。 251 | 除了默认调度器(`default kubernetes scheduler`),Kubernetes高级调度特性(`Advanced 
Scheduling`)引入了更加灵活的策略,以应对复杂多样的业务需求。 252 | 253 | ## 亲和性 254 | 设想有一个Pending状态等待调度的Pod,尝试用Kubernetes高级调度特性-亲和性,找到最优解时,需要考虑如下几方面的内容: 255 | | 亲和对象 | 亲和类型 | 策略 | 运算符 | 256 | | --- | --- | --- | --- | 257 | | Node
Pod | 亲和(affinity)<br>反亲和(anti-affinity) | requiredDuringSchedulingIgnoredDuringExecution<br>requiredDuringSchedulingRequiredDuringExecution<br>preferredDuringSchedulingIgnoredDuringExecution | In/NotIn<br>Exists/DoesNotExist<br>
Gt/Lt | 258 | 259 | ### 配置示例 260 | 当有worker时,优先调度到worker上,否则调度到master上: 261 | ```yaml 262 | affinity: 263 | nodeAffinity: 264 | requiredDuringSchedulingIgnoredDuringExecution: 265 | nodeSelectorTerms: 266 | - matchExpressions: 267 | - key: node-role.kubernetes.io/worker 268 | operator: Exists 269 | - matchExpressions: 270 | - key: node-role.kubernetes.io/master 271 | operator: Exists 272 | preferredDuringSchedulingIgnoredDuringExecution: 273 | - weight: 100 274 | preference: 275 | matchExpressions: 276 | - key: node-role.kubernetes.io/worker 277 | operator: Exists 278 | - weight: 1 279 | preference: 280 | matchExpressions: 281 | - key: node-role.kubernetes.io/master 282 | operator: Exists 283 | tolerations: 284 | - effect: NoSchedule 285 | key: node-role.kubernetes.io/master 286 | operator: Exists 287 | ``` 288 | 289 | ## 自定义调度器 290 | custom scheduler,通过Bash脚本实现自定义调度器示例 291 | ```bash 292 | #!/bin/bash 293 | KUBECTL='xxx' 294 | SERVER='xxx' 295 | MYSQL_POD_NAME='mysql-node' 296 | 297 | function find_mysql_master_node() 298 | { 299 | MYSQL_PODS=$($KUBECTL --server $SERVER get pod -o wide | grep $MYSQL_POD_NAME | grep Running | awk '{print $6,$7}') 300 | IFS=' ' read -r -a MYSQL_PODS <<< $MYSQL_PODS 301 | for ((i=0;i<${#MYSQL_PODS[@]};i+=2)); 302 | do 303 | podip=${MYSQL_PODS[i]} 304 | nodeip=${MYSQL_PODS[i+1]} 305 | result=$(mysql -uroot -ppassword -h${podip} --connect-timeout=5 -e 'show slave hosts;') 306 | if [ -n "$result" ]; then 307 | echo $nodeip 308 | return 309 | fi 310 | done 311 | echo null 312 | return 313 | } 314 | function find_k8s_master_node() 315 | { 316 | NODES=$($KUBECTL --server $SERVER get node | grep -v NAME | awk '{print $1}') 317 | for i in ${NODES}; 318 | do 319 | result=$(ssh root@${i} ps -ef | grep kube-controller | grep -v grep) 320 | if [ -n "$result" ]; then 321 | echo ${i} 322 | return 323 | fi 324 | done 325 | echo null 326 | return 327 | } 328 | while true; 329 | do 330 | for POD in $($KUBECTL --server $SERVER get pod -o json | jq '.items[] | select(.spec.schedulerName == "smart-scheduler") | 331 | select(.spec.nodeName == null) | select(.status.phase == "Pending") | .metadata.name' | tr -d '"'); 332 | do 333 | NODES=$($KUBECTL --server $SERVER get node | grep Ready | awk '{print $1}') 334 | MYSQL_MNODE=$(find_mysql_master_node) 335 | K8S_MNODE=$(find_k8s_master_node) 336 | for NODE in ${NODES}; 337 | do 338 | if [ ${NODE} != ${MYSQL_MNODE} ]; then 339 | if [ ${NODE} != ${K8S_MNODE} ]; then 340 | curl --header "Content-Type:application/json" \ 341 | --request POST \ 342 | --data '{"apiVersion":"v1", "kind": "Binding", "metadata": {"name": "'$POD'"}, 343 | "target": {"apiVersion": "v1", "kind": "Node", "name": "'$NODE'"}}' \ 344 | http://$SERVER/api/v1/namespaces/default/pods/$POD/binding/ #1>/dev/null 2>&1 345 | echo "Assigned ${POD} to ${NODE}, bypass mysql master ${MYSQL_MNODE} and k8s master ${K8S_MNODE}" 346 | fi 347 | fi 348 | done 349 | done 350 | #echo mysql $(find_mysql_master_node) 351 | #echo k8s $(find_k8s_master_node) 352 | sleep 2 353 | done 354 | ``` 355 | 356 | 要使用上述自定义调度器,工作负载配置`schedulerName: smart-scheduler`。 357 | 自定义调度器就是一个“controller”,不停的“reconcile”。 358 | 359 | 360 | # API优先级APIPriorityAndFairness 361 | ```bash 362 | # https://www.yisu.com/zixun/523074.html 363 | kubectl get --raw /debug/api_priority_and_fairness/dump_priority_levels 364 | ``` 365 | 366 | # 以CRD方式扩展API 367 | https://kubernetes.io/docs/tasks/extend-kubernetes/custom-resources/custom-resource-definitions/ 368 | 369 | # Pod调度如何感知volume的topology 370 | 环境中有三个节点,类型为Controller: 
371 | ```bash 372 | [root@zy-m224 hehe]# kubectl get node -l nodeType=controller 373 | NAME STATUS ROLES AGE VERSION 374 | zy-m224 Ready compute,infra,master 1d v1.11.0+d4cacc0 375 | zy-s222 Ready compute,infra,master 1d v1.11.0+d4cacc0 376 | zy-s223 Ready compute,infra,master 1d v1.11.0+d4cacc0 377 | ``` 378 | 379 | 创建`storageclass`为ha-low的pvc,其存在两个副本: 380 | ```bash 381 | [root@zy-m224 hehe]# kubectl get sc ha-low -o yaml --export 382 | apiVersion: storage.k8s.io/v1 383 | kind: StorageClass 384 | metadata: 385 | annotations: 386 | storage.alpha.openshift.io/access-mode: ReadWriteOnce 387 | creationTimestamp: null 388 | name: ha-low 389 | selfLink: /apis/storage.k8s.io/v1/storageclasses/ha-low 390 | parameters: 391 | fstype: ext4 392 | replicas: "2" 393 | selector: beta.kubernetes.io/arch=amd64,beta.kubernetes.io/os=linux,nodeType=controller 394 | provisioner: ctriple.cn/drbd 395 | reclaimPolicy: Retain 396 | volumeBindingMode: Immediate 397 | ``` 398 | 399 | 自动部署的pv和底层存储被调度到`zy-m224`和`zy-s222`节点: 400 | ```bash 401 | [root@zy-m224 hehe]# kubectl get pvc test-pvc 402 | NAME STATUS VOLUME CAPACITY ACCESS MODES STORAGECLASS AGE 403 | test-pvc Bound r0005 1Gi RWO ha-low 46m 404 | [root@zy-m224 hehe]# kubectl get pv r0005 405 | NAME CAPACITY ACCESS MODES RECLAIM POLICY STATUS CLAIM STORAGECLASS REASON AGE 406 | r0005 1Gi RWO Retain Bound default/test-pvc ha-low 46m 407 | [root@zy-m224 hehe]# ansible controller -m shell -a "lvs | grep r0005" 408 | zy-s223 | FAILED | rc=1 >> 409 | non-zero return code 410 | 411 | zy-m224 | SUCCESS | rc=0 >> 412 | r0005 centos -wi-ao---- 1.00g 413 | 414 | zy-s222 | SUCCESS | rc=0 >> 415 | r0005 centos -wi-ao---- 1.00g 416 | ``` 417 | 418 | 让pod,使用该pvc后,反复删除、重启pod,发现该pod只会调度到`zy-m224`和`zy-s222`节点: 419 | ```bash 420 | [root@zy-m224 hehe]# pod | grep wechat 421 | default wechat-874jj 1/1 Running 0 8m 10.242.0.142 zy-m224 422 | ``` 423 | 424 | 修改rc/wechat,将其绑定到错误的节点`zy-s223`: 425 | ```bash 426 | ... 427 | hostname: wechat 428 | nodeSelector: 429 | node: node3 430 | nodeType: controller 431 | ... 432 | ``` 433 | 434 | 删除pod后重新调度,一直处于`Pending`状态,并报`volume node affinity conflict`: 435 | ```bash 436 | [root@zy-m224 scripts]# kubectl describe pod wechat-82z6q 437 | ... 438 | Events: 439 | Type Reason Age From Message 440 | ---- ------ ---- ---- ------- 441 | Warning FailedScheduling 3m (x25 over 3m) default-scheduler 0/4 nodes are available: 1 node(s) had volume node affinity conflict, 3 node(s) didn't match node selector. 442 | ``` 443 | 444 | 来龙去脉大致如下:kube-scheduler调度pod时,检查其绑定的volume,顺着pvc->pv,发现pv有配置`nodeAffinity`: 445 | ```bash 446 | apiVersion: v1 447 | kind: PersistentVolume 448 | metadata: 449 | ... 450 | spec: 451 | ... 
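  # 补充注释:以下 nodeAffinity(推测由 ctriple.cn/drbd provisioner 创建 PV 时写入)
  # 将使用该 PV 的 Pod 限制在存储副本所在的 zy-m224 / zy-s222 两个节点上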
452 | nodeAffinity: 453 | required: 454 | nodeSelectorTerms: 455 | - matchExpressions: 456 | - key: kubernetes.io/hostname 457 | operator: In 458 | values: 459 | - zy-m224 460 | - zy-s222 461 | persistentVolumeReclaimPolicy: Retain 462 | storageClassName: ha-low 463 | status: 464 | phase: Bound 465 | ``` 466 | 467 | 阅读更多: 468 | - [VOLUME TOPOLOGY-AWARE SCHEDULING](https://stupefied-goodall-e282f7.netlify.com/contributors/design-proposals/storage/volume-topology-scheduling/) 469 | 470 | 471 | # CPU资源高级管理 472 | TODO 473 | - https://docs.openshift.com/container-platform/3.11/scaling_performance/using_cpu_manager.html 474 | - https://kubernetes.io/docs/tasks/administer-cluster/cpu-management-policies/ 475 | 476 | # kube-proxy集群内负载均衡 477 | 作为K8s集群内默认负载均衡解决方案,kube-proxy支持模式方式: 478 | * [ipvs](https://kubernetes.io/blog/2018/07/09/ipvs-based-in-cluster-load-balancing-deep-dive/),未来发展方向 479 | * [iptables](https://kubernetes.io/docs/concepts/services-networking/service/),默认方式 480 | * [user-space](https://kubernetes.io/docs/concepts/services-networking/service/),已逐渐被淘汰 481 | 482 | ## 深入iptables模式的kube-proxy 483 | 484 | ### 实现会话亲和性 485 | 开启会话亲和性,`sessionAffinity: ClientIP`时,iptables规则: 486 | ```bash 487 | :KUBE-SEP-2ZNXFH2VOSGBPAVV - [0:0] 488 | :KUBE-SEP-G2V5AWNNIXO6IYNV - [0:0] 489 | :KUBE-SEP-SRB22U7BNHNW5WLR - [0:0] 490 | :KUBE-SVC-TYE23RAXJNHAJ33G - [0:0] 491 | -A KUBE-NODEPORTS -p tcp -m comment --comment "space22pbugsd/yibao-b:yibao-b" -m tcp --dport 13332 -j KUBE-MARK-MASQ 492 | -A KUBE-NODEPORTS -p tcp -m comment --comment "space22pbugsd/yibao-b:yibao-b" -m tcp --dport 13332 -j KUBE-SVC-TYE23RAXJNHAJ33G 493 | -A KUBE-SEP-2ZNXFH2VOSGBPAVV -s 10.244.1.31/32 -m comment --comment "space22pbugsd/yibao-b:yibao-b" -j KUBE-MARK-MASQ 494 | -A KUBE-SEP-2ZNXFH2VOSGBPAVV -p tcp -m comment --comment "space22pbugsd/yibao-b:yibao-b" -m recent --set --name KUBE-SEP-2ZNXFH2VOSGBPAVV --mask 255.255.255.255 --rsource -m tcp -j DNAT --to-destination 10.244.1.31:13332 495 | -A KUBE-SEP-G2V5AWNNIXO6IYNV -s 10.246.0.133/32 -m comment --comment "space22pbugsd/yibao-b:yibao-b" -j KUBE-MARK-MASQ 496 | -A KUBE-SEP-G2V5AWNNIXO6IYNV -p tcp -m comment --comment "space22pbugsd/yibao-b:yibao-b" -m recent --set --name KUBE-SEP-G2V5AWNNIXO6IYNV --mask 255.255.255.255 --rsource -m tcp -j DNAT --to-destination 10.246.0.133:13332 497 | -A KUBE-SEP-SRB22U7BNHNW5WLR -s 10.243.1.179/32 -m comment --comment "space22pbugsd/yibao-b:yibao-b" -j KUBE-MARK-MASQ 498 | -A KUBE-SEP-SRB22U7BNHNW5WLR -p tcp -m comment --comment "space22pbugsd/yibao-b:yibao-b" -m recent --set --name KUBE-SEP-SRB22U7BNHNW5WLR --mask 255.255.255.255 --rsource -m tcp -j DNAT --to-destination 10.243.1.179:13332 499 | -A KUBE-SERVICES ! 
-s 10.240.0.0/12 -d 10.100.218.244/32 -p tcp -m comment --comment "space22pbugsd/yibao-b:yibao-b cluster IP" -m tcp --dport 13332 -j KUBE-MARK-MASQ 500 | -A KUBE-SERVICES -d 10.100.218.244/32 -p tcp -m comment --comment "space22pbugsd/yibao-b:yibao-b cluster IP" -m tcp --dport 13332 -j KUBE-SVC-TYE23RAXJNHAJ33G 501 | -A KUBE-SVC-TYE23RAXJNHAJ33G -m comment --comment "space22pbugsd/yibao-b:yibao-b" -m recent --rcheck --seconds 10800 --reap --name KUBE-SEP-SRB22U7BNHNW5WLR --mask 255.255.255.255 --rsource -j KUBE-SEP-SRB22U7BNHNW5WLR 502 | -A KUBE-SVC-TYE23RAXJNHAJ33G -m comment --comment "space22pbugsd/yibao-b:yibao-b" -m recent --rcheck --seconds 10800 --reap --name KUBE-SEP-2ZNXFH2VOSGBPAVV --mask 255.255.255.255 --rsource -j KUBE-SEP-2ZNXFH2VOSGBPAVV 503 | -A KUBE-SVC-TYE23RAXJNHAJ33G -m comment --comment "space22pbugsd/yibao-b:yibao-b" -m recent --rcheck --seconds 10800 --reap --name KUBE-SEP-G2V5AWNNIXO6IYNV --mask 255.255.255.255 --rsource -j KUBE-SEP-G2V5AWNNIXO6IYNV 504 | -A KUBE-SVC-TYE23RAXJNHAJ33G -m comment --comment "space22pbugsd/yibao-b:yibao-b" -m statistic --mode random --probability 0.33332999982 -j KUBE-SEP-SRB22U7BNHNW5WLR 505 | -A KUBE-SVC-TYE23RAXJNHAJ33G -m comment --comment "space22pbugsd/yibao-b:yibao-b" -m statistic --mode random --probability 0.50000000000 -j KUBE-SEP-2ZNXFH2VOSGBPAVV 506 | -A KUBE-SVC-TYE23RAXJNHAJ33G -m comment --comment "space22pbugsd/yibao-b:yibao-b" -j KUBE-SEP-G2V5AWNNIXO6IYNV 507 | ``` 508 | 通过`recent`模块实现会话亲和性。 509 | 510 | 关闭会话亲和性,`sessionAffinity: None`时,iptables规则: 511 | ```bash 512 | :KUBE-SEP-2ZNXFH2VOSGBPAVV - [0:0] 513 | :KUBE-SEP-G2V5AWNNIXO6IYNV - [0:0] 514 | :KUBE-SEP-SRB22U7BNHNW5WLR - [0:0] 515 | :KUBE-SVC-TYE23RAXJNHAJ33G - [0:0] 516 | -A KUBE-NODEPORTS -p tcp -m comment --comment "space22pbugsd/yibao-b:yibao-b" -m tcp --dport 13332 -j KUBE-MARK-MASQ 517 | -A KUBE-NODEPORTS -p tcp -m comment --comment "space22pbugsd/yibao-b:yibao-b" -m tcp --dport 13332 -j KUBE-SVC-TYE23RAXJNHAJ33G 518 | -A KUBE-SEP-2ZNXFH2VOSGBPAVV -s 10.244.1.31/32 -m comment --comment "space22pbugsd/yibao-b:yibao-b" -j KUBE-MARK-MASQ 519 | -A KUBE-SEP-2ZNXFH2VOSGBPAVV -p tcp -m comment --comment "space22pbugsd/yibao-b:yibao-b" -m tcp -j DNAT --to-destination 10.244.1.31:13332 520 | -A KUBE-SEP-G2V5AWNNIXO6IYNV -s 10.246.0.133/32 -m comment --comment "space22pbugsd/yibao-b:yibao-b" -j KUBE-MARK-MASQ 521 | -A KUBE-SEP-G2V5AWNNIXO6IYNV -p tcp -m comment --comment "space22pbugsd/yibao-b:yibao-b" -m tcp -j DNAT --to-destination 10.246.0.133:13332 522 | -A KUBE-SEP-SRB22U7BNHNW5WLR -s 10.243.1.179/32 -m comment --comment "space22pbugsd/yibao-b:yibao-b" -j KUBE-MARK-MASQ 523 | -A KUBE-SEP-SRB22U7BNHNW5WLR -p tcp -m comment --comment "space22pbugsd/yibao-b:yibao-b" -m tcp -j DNAT --to-destination 10.243.1.179:13332 524 | -A KUBE-SERVICES ! 
-s 10.240.0.0/12 -d 10.100.218.244/32 -p tcp -m comment --comment "space22pbugsd/yibao-b:yibao-b cluster IP" -m tcp --dport 13332 -j KUBE-MARK-MASQ 525 | -A KUBE-SERVICES -d 10.100.218.244/32 -p tcp -m comment --comment "space22pbugsd/yibao-b:yibao-b cluster IP" -m tcp --dport 13332 -j KUBE-SVC-TYE23RAXJNHAJ33G 526 | -A KUBE-SVC-TYE23RAXJNHAJ33G -m comment --comment "space22pbugsd/yibao-b:yibao-b" -m statistic --mode random --probability 0.33332999982 -j KUBE-SEP-SRB22U7BNHNW5WLR 527 | -A KUBE-SVC-TYE23RAXJNHAJ33G -m comment --comment "space22pbugsd/yibao-b:yibao-b" -m statistic --mode random --probability 0.50000000000 -j KUBE-SEP-2ZNXFH2VOSGBPAVV 528 | -A KUBE-SVC-TYE23RAXJNHAJ33G -m comment --comment "space22pbugsd/yibao-b:yibao-b" -j KUBE-SEP-G2V5AWNNIXO6IYNV 529 | ``` 530 | 531 | 532 | # 域名解析和DNS策略 533 | 534 | ## Pod's DNS Policy 535 | 参见[Pod’s DNS Policy](https://kubernetes.io/docs/concepts/services-networking/dns-pod-service/#pod-s-dns-policy) 536 | 537 | # 对象名称和字符串格式检查 538 | 参见[Object Names and IDs](https://kubernetes.io/docs/concepts/overview/working-with-objects/names/),Kubernetes中绝大多数对象名称需符合[RFC 1123](https://tools.ietf.org/html/rfc1123)要求,具体的: 539 | * contain no more than 253 characters 540 | * contain only lowercase alphanumeric characters, ‘-’ or ‘.’ 541 | * start with an alphanumeric character 542 | * end with an alphanumeric character 543 | 544 | 其对应正则表达式为 545 | ```bash 546 | '[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*' 547 | ``` 548 | 549 | 标签Label的key合法格式 550 | > Valid label keys have two segments: an optional prefix and name, separated by a slash (/). 551 | > The name segment is required and must be 63 characters or less, beginning and ending with an alphanumeric character ([a-z0-9A-Z]) with dashes (-), underscores (_), dots (.), and alphanumerics between. 552 | > The prefix is optional. If specified, the prefix must be a DNS subdomain: a series of DNS labels separated by dots (.), not longer than 253 characters in total, followed by a slash (/). 553 | > Valid label values must be 63 characters or less and must be empty or begin and end with an alphanumeric character ([a-z0-9A-Z]) with dashes (-), underscores (_), dots (.), and alphanumerics between. 554 | 555 | 标签Label的value合法格式 556 | > Valid label values must be 63 characters or less and must be empty or begin and end with an alphanumeric character ([a-z0-9A-Z]) with dashes (-), underscores (_), dots (.), and alphanumerics between. 557 | 558 | 参见[Labels and Selectors](https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/) 559 | 560 | 就文件名filename而言,参照[The POSIX portable file name character set](https://www.ibm.com/support/knowledgecenter/en/SSLTBW_2.2.0/com.ibm.zos.v2r2.bpxa400/bpxug469.htm): 561 | * Uppercase A to Z 562 | * Lowercase a to z 563 | * Numbers 0 to 9 564 | * Period (.) 
565 | * Underscore (_) 566 | * Hyphen (-) 567 | 568 | 569 | 更详细的检查,请参见`https://github.com/kubernetes/kubernetes/blob/master/pkg/apis/core/validation/validation.go`。 570 | 571 | 572 | # kubectl插件 573 | 摘自 [kubectl overview](https://kubernetes.io/docs/reference/kubectl/overview/) 574 | 575 | 只要在`PATH`路径下创建以`kubectl-`开头的可执行文件,即可被`kubectl`识别,并作为插件进行集成使用。如下以`kubectl whoami`为例说明。 576 | 577 | 首先,创建`/usr/local/bin/kubectl-whoami`文件,其内容如下: 578 | ```bash 579 | #!/bin/bash 580 | 581 | # this plugin makes use of the `kubectl config` command in order to output 582 | # information about the current user, based on the currently selected context 583 | kubectl config view --template='{{ range .contexts }}{{ if eq .name "'$(kubectl config current-context)'" }}Current user: {{ printf "%s\n" .context.user }}{{ end }}{{ end }}' 584 | ``` 585 | 586 | 然后,将其设置为可执行: 587 | ```bash 588 | # chmod a+x /usr/local/bin/kubectl-whoami 589 | ``` 590 | 591 | 最后,检验: 592 | ```bash 593 | [root@xxx ~]# kubectl plugin list 594 | The following compatible plugins are available: 595 | 596 | /usr/local/bin/kubectl-whoami 597 | [root@xxx ~]# kubectl whoami 598 | Current user: kubernetes-admin 599 | 600 | ``` 601 | 602 | 603 | # 认证Authentication 604 | ## 账号 605 | ### Kubernetes用户 606 | #### 服务账号Service Account 607 | #### 证书用户User 608 | ##### 如何创建一个证书用户 609 | 参见 [certificate-signing-requests](https://kubernetes.io/docs/reference/access-authn-authz/certificate-signing-requests/#normal-user) 610 | 611 | 创建私钥和csr文件: 612 | ```bash 613 | openssl genrsa -out john.key 2048 614 | openssl req -new -key john.key -out john.csr 615 | ``` 616 | 注意,在创建`john.csr`文件时会交互式的输入`CN`和`O`属性,其分别配置了用户名称user name和用户组group。 617 | 618 | 创建K8s资源CertificateSigningRequest: 619 | ```bash 620 | cat <> ${_RESULT_FILE_NAME_} 769 | cat ./kube-apiserver.log* | grep "trace.go:116" | grep -v "ms):$\|etcd3\|cacher list" | sed 's/.*Trace\[.*\]: //g' | grep "/api/v1/events\|/api/v1/nodes,\|/api/v1/pods\|/api/v1/services\|/api/v1/endpoints" | while read -r line 770 | do 771 | verb=$(echo $line | awk '{print $1}' | sed 's/"//g') 772 | url=$(echo $line | grep -Eo "url:.*," | cut -d, -f1 | sed 's/url://g') 773 | agent=$(echo $line | grep -Eo "user-agent:.*," | cut -d, -f1 | sed 's/user-agent://g' | awk '{print $1}') 774 | client=$(echo $line | grep -Eo "client:.* \(s" | awk '{print $1}' | sed 's/client://g') 775 | time=$(echo $line | awk '{print $NF}' | sed 's/)://g') 776 | 777 | echo $verb $url $agent $client 778 | done | sort | uniq -c | sed 's/^[ ]*//g' | tr ' ' ',' >> ${_RESULT_FILE_NAME_} 779 | ``` 780 | 781 | ### 筛选出最早创建的一组pod(用于onDelete策略的更新) 782 | ```bash 783 | STEP=100 784 | kubectl get pod -n foo -l name=bar --sort-by=.status.startTime -owide --no-headers | head -n ${STEP} 785 | ``` 786 | 787 | ## 节点维护 788 | ```bash 789 | # 排干节点 790 | kubectl drain ${node} --delete-emptydir-data --ignore-daemonsets --force 791 | 792 | # 为节点打污点 793 | kubectl taint nodes worker foo:NoSchedule 794 | kubectl taint nodes worker foo=bar:NoExecute 795 | ``` 796 | 797 | ## 便捷操作 798 | * 查找某个节点上带某种注解的pod 799 | ```bash 800 | NODE_NAME=hehe 801 | kubectl get pod -A --field-selector spec.nodeName=$NODE_NAME -o json | jq -r '.items[] | select(.metadata.annotations["foo/bar"] != null) | .metadata | .namespace + " " + .name' 802 | ``` 803 | 804 | * 查询Pod的uid 805 | ```bash 806 | kubectl get pod -A -o custom-columns=NS:.metadata.namespace,NAME:.metadata.name,UID:.metadata.uid 807 | ``` 808 | 809 | * 清理`Completed`状态的Pod 810 | ```bash 811 | kubectl delete pod --field-selector=status.phase==Succeeded 
--all-namespaces 812 | ``` 813 | 814 | * 清理`Error`状态的Pod 815 | ```bash 816 | kubectl delete pod --field-selector=status.phase==Failed --all-namespaces 817 | ``` 818 | 819 | * 清理`NodeAffinity`状态的Pod 820 | ```bash 821 | kubectl get pod -A -owide | grep NodeAffinity | awk '{print $1" "$2}' | xargs kubectl delete pod -n $1 $2 822 | ``` 823 | 824 | * 找到master节点 825 | ```bash 826 | kubectl get node -l node-role.kubernetes.io/master= -o json | jq '.items[].status.addresses[] | select(.type == "InternalIP") | .address' -r 827 | ``` 828 | 829 | * 找到worker节点,且不是master节点 830 | ```bash 831 | kubectl get node -l node-role.kubernetes.io/worker= -l node-role.kubernetes.io/master!= -o json | jq '.items[].status.addresses[] | select(.type == "InternalIP") | .address' -r 832 | ``` 833 | 834 | * 常用操作别名 835 | ```bash 836 | alias pod='kubectl get pod -o wide -A' 837 | alias svc='kubectl get svc -A' 838 | alias node='kubectl get node -o wide' 839 | alias kc='kubectl' 840 | ``` 841 | 842 | * 统计各节点上Pod数 843 | ```bash 844 | function nodePodCnt { 845 | local tmp_file=$(mktemp) 846 | 847 | kubectl get pod -A -owide --no-headers > ${tmp_file} 848 | if [ $? -eq 0 ]; then 849 | for node in $(cat ${tmp_file} | awk '{print $(NF-2)}' | sort | uniq); do 850 | echo ${node} $(cat ${tmp_file} | grep -cw ${node}) 851 | done 852 | fi 853 | 854 | rm -f ${tmp_file} 855 | } 856 | ``` 857 | 858 | * base64编码的证书信息 859 | ```bash 860 | function b642cert { 861 | local b64=$1 862 | echo $b64 | base64 -d | openssl x509 -noout -text 863 | } 864 | ``` 865 | 866 | ## event使用独立的etcd集群 867 | ```bash 868 | --etcd-servers-overrides="/events#https://1.2.3.1:2369;https://1.2.3.2:2369;https://1.2.3.3:2369" 869 | ``` 870 | 871 | ## 模拟list对kube-apiserver进行压测 872 | 10qps: 873 | ``` 874 | #!/bin/bash 875 | while true 876 | do 877 | for((i = 0; i<10; i++)); do 878 | { 879 | timeout 6 kubectl get --raw /api/v1/pods 1>/dev/null 880 | }& 881 | done 882 | sleep 1s 883 | echo "$(date) start the next loop..." 
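# 补充注释:内层一次并发发起 10 个 list pods 请求,timeout 6 防止慢请求堆积;
# 调整并发数或 sleep 间隔即可改变压测 QPS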
884 | done 885 | ``` 886 | 887 | ## 获取openapi json 888 | ```bash 889 | kubectl get --raw /openapi/v2 | jq > openapi.json 890 | ``` 891 | 此后可用*swagger*打开api文档。 892 | 893 | ## 从secret中获取证书信息 894 | ```bash 895 | function b642cert { 896 | local b64=$1 897 | echo $b64 | base64 -d | openssl x509 -noout -text 898 | } 899 | ``` 900 | 901 | ## 从KubeConfig文件中提取证书秘钥 902 | ```bash 903 | # TODO: 兼容配置有多个cluster、多个user的情况,需要通过current-context判断 904 | PATH_TO_KUBECONFIG=/root/.kube/config 905 | cat $PATH_TO_KUBECONFIG | grep certificate-authority-data | awk '{print $2}' | base64 -d > ca.crt 906 | cat $PATH_TO_KUBECONFIG | grep client-certificate-data | awk '{print $2}' | base64 -d > tls.crt 907 | cat $PATH_TO_KUBECONFIG | grep client-key-data | awk '{print $2}' | base64 -d > tls.key 908 | ``` 909 | 910 | 911 | ## 堆栈文件分析 912 | ```bash 913 | # goroutine统计 914 | grep ^goroutine xxx-goroutine-9.log -A 1 | grep -v "^goroutine\|^--" | sort | less 915 | ``` 916 | ## 根据sa生成kubeconfig 917 | ```bash 918 | # your server name goes here 919 | server=https://localhost:8443 920 | 921 | # sa ns and name 922 | sa_ns=kube-system 923 | sa_name=admin 924 | # the name of the secret containing the service account token goes here 925 | secret_name=$(kubectl get sa -n $sa_ns $sa_name -o json | jq .secrets[] -r | grep -- "-token-" | awk '{print $2}' | tr -d '"') 926 | 927 | ca=$(kubectl get -n $sa_ns secret/$secret_name -o jsonpath='{.data.ca\.crt}') 928 | token=$(kubectl get -n $sa_ns secret/$secret_name -o jsonpath='{.data.token}' | base64 --decode) 929 | 930 | echo " 931 | apiVersion: v1 932 | kind: Config 933 | clusters: 934 | - name: default-cluster 935 | cluster: 936 | certificate-authority-data: ${ca} 937 | server: ${server} 938 | contexts: 939 | - name: default-context 940 | context: 941 | cluster: default-cluster 942 | namespace: default 943 | user: default-user 944 | current-context: default-context 945 | users: 946 | - name: default-user 947 | user: 948 | token: ${token}" > sa.kubeconfig 949 | ``` 950 | 951 | ## kubeconfig跳过服务端证书校验 952 | ```bash 953 | clusters: 954 | - cluster: 955 | server: https://foo.bar:6443 956 | insecure-skip-tls-verify: true 957 | name: hehecluster 958 | ... 
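# 补充说明:insecure-skip-tls-verify 与 certificate-authority-data 不能同时配置,
# 否则 client-go 校验 kubeconfig 时会直接报错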
959 | ``` 960 | 961 | ## 定制kubectl输出 962 | ```bash 963 | # 定制输出 964 | kubectl get pod --sort-by=.status.startTime -o=custom-columns=name:.metadata.name,startTime:.status.startTime 965 | ``` 966 | 967 | ## kubectl patch操作 968 | 969 | 命令行: 970 | ```bash 971 | # merge方式 972 | kubectl patch mykind demo --type=merge --subresource status --patch 'status: {conditions: [{type: Degraded, status: "False", reason: AsExpected, message: "everything is ok", lastTransitionTime: "2024-07-11T09:08:47Z"}]}' 973 | 974 | # json方式 975 | kubectl patch bmh -n machine-api worker1 --subresource status --type='json' -p='[{"op": "replace", "path": "/status/hardware/hostname", "value": "hehe"}]' 976 | 977 | # 删除字段(以pod反亲和举例) 978 | kubectl patch deploy test -p '{"spec":{"template": {"spec": {"affinity": {"podAntiAffinity":null}}}}' 979 | ``` 980 | 981 | 从标准输入中打patch: 982 | ```bash 983 | cat << EEOOFF | kubectl patch mykind demo --type=merge --subresource status --patch-file=/dev/stdin 984 | status: 985 | conditions: 986 | - type: Available 987 | status: "True" 988 | lastTransitionTime: "2024-07-11T09:12:23Z" 989 | reason: AsExpected 990 | message: |- 991 | DemoServiceAvailable: All service is available 992 | EEOOFF 993 | ``` 994 | 995 | ## 常见操作 996 | 997 | ```bash 998 | # 查看pod和容器的创建、启动时间 999 | # 输入Pod相关信息 1000 | NS=default 1001 | POD=test-pod 1002 | # 获取ID信息 1003 | CID=$(kubectl describe pod -n $NS $POD | grep cri-o | cut -d/ -f3) 1004 | PODUID=$(kubectl get pod -n $NS $POD -o jsonpath='{.metadata.uid}') 1005 | # 在Pod所在节点执行,查看pod和容器创建日志 1006 | journalctl -u kubelet -u crio | grep "$POD\|$CID\|$PODUID" | grep "RemoteRuntimeService\|Created container\|Creating container\|Created container\|Starting container\|Started container" 1007 | 1008 | # 找到挂载主机根目录的pod 1009 | kubectl get pod -A -o=custom-columns=NAMESPACE:.metadata.namespace,NAME:.metadata.name,VOLUMES:.spec.volumes | grep hostPath | grep "path:/ " | awk '{print $1" "$2}' 1010 | 1011 | # 找到deploy对应的pod(使用 jq 的 to_entries ) 1012 | selector=$(kubectl get deploy -n $ns $name -o jsonpath='{.spec.selector.matchLabels}' | jq 'to_entries[]| .key + "=" + .value' -r | tr '\n' ',' | sed 's/,$//g') 1013 | kubectl get pod -n $ns -l $selector 1014 | 1015 | # 批量找到deploy对应的pod 1016 | for deploy in $(kubectl get deploy -A -o=custom-columns=NS:.metadata.namespace,NAME:.metadata.name,REPLICA:.spec.replicas | grep " 2$" | awk '{print $1"/"$2}'); do 1017 | ns=$(echo $deploy | cut -d/ -f1) 1018 | name=$(echo $deploy | cut -d/ -f2) 1019 | selector=$(kubectl get deploy -n $ns $name -o jsonpath='{.spec.selector.matchLabels}' | jq 'to_entries[]| .key + "=" + .value' -r | tr '\n' ',' | sed 's/,$//g') 1020 | nodes=$(kubectl get pod -owide -n $ns -l $selector -o=custom-columns=NODE:.spec.nodeName --no-headers | tr '\n' ' ') 1021 | printf "%-30s %-50s %s\n" "$ns" "$name" "$nodes" 1022 | done 1023 | 1024 | 1025 | # 手动拉取pod使用的容器镜像 1026 | function man_pull { 1027 | local ns=$1 1028 | local pod=$2 1029 | 1030 | for i in $(kubectl get pod -n ${ns} ${pod} -o json | jq .spec.containers[].image -r | sort | uniq); do 1031 | podman pull $i 1032 | done 1033 | } 1034 | 1035 | # 停止一个节点上的容器服务和所有容器 1036 | systemctl stop kubelet 1037 | crictl ps -q | xargs crictl stop 1038 | 1039 | # 以创建时间排序 1040 | kubectl get pod -A --sort-by .metadata.creationTimestamp 1041 | 1042 | # 查看API版本 1043 | kubectl api-versions 1044 | # 注意,OpenShift的Controller-Manager和Scheduler组件整合为controller组件,并使用https://x.x.x.x:8444/healthz作为健康检查endpoint 1045 | # OpenShift平台查看controller的健康情况 1046 | curl -k https://10.125.30.224:8444/healthz 1047 
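# 查看kube-apiserver各健康/就绪检查项的详细结果
kubectl get --raw '/healthz?verbose'
kubectl get --raw '/readyz?verbose'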
| # 查看集群组件信息 1048 | kubectl get componentstatus 1049 | kubectl get --raw /api/v1/componentstatuses/controller-manager | jq 1050 | kubectl get --raw /apis/metrics.k8s.io/v1beta1/namespaces/openshift-sdn/pods/sdn-5bbcx | jq 1051 | kubectl get --raw /apis/custom.metrics.k8s.io/v1beta1/namespaces/default/pods/*/http_requests | jq 1052 | ./kubectl --server=https://kubernetes/ --certificate-authority=/tmp/openssl/ca.crt --client-certificate=/tmp/openssl/client.crt --client-key=/tmp/openssl/client.key get pod 1053 | /opt/bin/kubectl -s 127.0.0.1:8888 get pod -o wide 1054 | /opt/bin/kubectl -s 127.0.0.1:8888 describe ep 1055 | # 查看Pod信息,定位问题 1056 | /opt/bin/kubectl -s 127.0.0.1:8888 describe pod 1057 | /opt/bin/kubectl -s 127.0.0.1:8888 cluster-info 1058 | /opt/bin/kubectl -s 127.0.0.1:8888 get services 1059 | /opt/bin/kubectl -s 127.0.0.1:8888 get rc 1060 | # 自定义信息的输出列 1061 | /opt/bin/kubectl -s 127.0.0.1:8888 get nodes -o=custom-columns=NAME:.metadata.name,IPS:.status.addresses 1062 | kubelet --help 2>&1 | less 1063 | # node状态为Ready,SchedulingDisabled时,手工开启调度 1064 | /opt/bin/kubectl -s 127.0.0.1:8888 uncordon 172.25.18.13 1065 | # 查看Pod web-1中前一个ruby容器的日志 1066 | kubectl logs -p -c ruby web-1 1067 | # 支持json格式解析 1068 | kubectl get svc mysql-node1 -o jsonpath='{.spec.clusterIP}' 1069 | kubectl get pods -n default -l app=foo -o=jsonpath='{range .items[*]}{.metadata.name} {end}' 1070 | kubectl get namespaces -o jsonpath='{.items[*].metadata.name}' 1071 | /opt/bin/kubectl -s 127.0.0.1:8888 delete -f /opt/bin/confFile-cluster/openstack-new-rc.yaml 1072 | # 使用--field-selector过滤 1073 | kubectl get pod -A --field-selector spec.nodeName=zy-sno 1074 | # go template示例 1075 | kubectl get ns -o jsonpath='{range .items[*]} {.metadata.name}{"\n"} {end}' 1076 | kubectl get pod -A --field-selector spec.nodeName=$(hostname) -o jsonpath='{range .items[?(.spec.dnsPolicy=="Default")]}{.metadata.namespace}{"/"}{.metadata.name}{"\n"}{end}' 1077 | kubectl get pod -A --field-selector spec.nodeName=$(hostname) -o jsonpath='{range .items[?(.spec.hostNetwork==true)]}{.metadata.namespace}{"/"}{.metadata.name}{"\n"}{end}' 1078 | kubectl get nodes --selector='node-role.kubernetes.io/master' -o jsonpath='{.items[0].status.conditions[?(@.type=="Ready")].status}' 1079 | kubectl get pod -o jsonpath='{.spec.containers[?(@.name=="dns")].image}' 1080 | kubectl get pod -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 1081 | kubectl get apirequestcounts -o jsonpath='{range .items[?(@.status.removedInRelease!="")]}{.status.removedInRelease}{"\t"}{.status.requestCount}{"\t"}{.metadata.name}{"\n"}{end}' 1082 | kubectl get apirequestcounts ingresses.v1beta1.networking.k8s.io \ 1083 | -o jsonpath='{range .status.last24h..byUser[*]}{..byVerb[*].verb}{","}{.username}{","}{.userAgent}{"\n"}{end}' 1084 | # 查看所有Pod 1085 | kubectl get pod | grep -v NAME | awk '{print $1}' 1086 | # 查看Pod的运行状态 1087 | kubectl get pod ceportalrc-n5sqd -o template --template={{.status.phase}} 1088 | # 查看Node的操作系统信息 1089 | kubectl get node 172.25.18.24 -o template --template={{.status.nodeInfo.osImage}} 1090 | # 查看容器的log 1091 | kubectl logs --namespace="kube-system" kube-dns-v17.1-rc1-27sj0 kubedns 1092 | kubectl drain ${node} --delete-emptydir-data --ignore-daemonsets --force 1093 | kubectl uncordon ${node} 1094 | # 给name为172.25.18.22的node打标签node: node3,kube-dns依赖于这个标签的。 1095 | kubectl label node 172.25.18.22 node=node3 1096 | kubectl label --overwrite node 172.25.19.119 nodeType=cellClus 1097 | # 删除节点的cellGrp标签 1098 | kubectl label node 172.25.19.117 
cellGrp- 1099 | # k8s直接进容器 1100 | kubectl exec -it [-c ] 1101 | # https://kubernetes.io/docs/tasks/debug-application-cluster/get-shell-running-container/ 1102 | # 其中双横线--将k8s命令同希望容器里执行的命令分隔开 1103 | kubectl exec -- /node-cache -help 1104 | # 示例,通过别名,方便的使用工具pod里的命令 1105 | alias ceph='kubectl -n rook-ceph exec $(kubectl -n rook-ceph get pod -l "app=rook-ceph-tools" -o jsonpath='{.items[0].metadata.name}') -- ceph' 1106 | # 查看/修改RBAC 1107 | kubectl edit clusterrole 1108 | # 查看事件 1109 | kubectl get events 1110 | # 过滤查看Warning类型的事件 1111 | kubectl get events --field-selector type=Warning 1112 | # 过滤查看异常类型的事件 1113 | kubectl get events --field-selector type!=Normal 1114 | # 格式化输出event 1115 | kubectl get event -A --sort-by=.firstTimestamp -o=custom-columns=NS:.metadata.namespace,NAME:.metadata.name,FirstSeen:.firstTimestamp,LastSeen:.lastTimestamp,REASON:.reason 1116 | # 过滤查看某个pod的事件 1117 | kubectl get event --namespace ns --field-selector involvedObject.kind=Pod --field-selector involvedObject.name=xxx-yyy 1118 | curl -s 'http://1.2.3.4:8080/api/v1/namespaces/default/pods?labelSelector=app=rabbitmq,node=n2' | jq '.items[].metadata.name' | tr -d '"' 1119 | 1120 | # 通过curl直接访问Kubernetes的HTTPS RESTful API,注意: 1121 | # --cacert 指定CA中心的证书crt 1122 | # --cert 指定curl客户端的证书(公钥) 1123 | # --key 指定curl客户端的密码key(私钥),需要与--cert指定的证书对应 1124 | # 老平台支持 1125 | curl --cacert /root/openssl/ca.crt --cert /root/openssl/172.25.19.117-server.crt --key /root/openssl/172.25.19.117-server.key https://172.25.19.117:6443/api/ 1126 | # 容器内支持 1127 | curl --cacert /root/openssl/ca.crt --cert /root/openssl/client.crt --key /root/openssl/client.key https://kubernetes/api/ 1128 | # 老平台和Openshift新平台均支持 1129 | curl --cacert /root/openssl/ca.crt --cert /root/openssl/client.crt --key /root/openssl/client.key https://10.100.0.1/api/ 1130 | # Openshift新平台支持 1131 | curl --cacert /root/openssl/ca.crt --cert /root/openssl/client.crt --key /root/openssl/client.key https://openshift-m2:8443/api/ 1132 | NSS_SDB_USE_CACHE=yes curl --cacert /etc/origin/master/ca.crt --cert /etc/origin/master/admin.crt --key /etc/origin/master/admin.key https://vip.cluster.local:8443/api/ 1133 | NSS_SDB_USE_CACHE=yes curl --cacert /etc/origin/master/ca.crt --cert /etc/origin/master/admin.crt --key /etc/origin/master/admin.key https://$(hostname):8443/apis/metrics.k8s.io/v1beta1?timeout=32s 1134 | 1135 | # 通过文件创建secret,其中指定secret中的键/文件名为htpasswd 1136 | kubectl create secret generic htpass-secret --from-file=htpasswd= -n kube-system 1137 | 1138 | ## 通过token直接访问apiserver 1139 | # 找到 default sa的携带token信息的secrets 1140 | kubectl get sa default -o yaml 1141 | # 直接从secrets中获取TOKEN 1142 | kubectl get secrets default-token-xxxxx -o jsonpath='{.data.token}' | base64 -d 1143 | # 从secrets中复原证书和秘钥 1144 | kubectl get secrets -n cattle-system tls-cert -o jsonpath='{.data.cert\.pem}' | base64 -d > cert.pem 1145 | NSS_SDB_USE_CACHE=yes curl -H "Authorization: Bearer ${TOKEN}" -k https://10.100.0.1/api/ 1146 | 1147 | # Pod(容器)里直接获取token的方法 1148 | TOKEN=$(cat /var/run/secrets/kubernetes.io/serviceaccount/token) 1149 | NSS_SDB_USE_CACHE=yes curl -s -H "Authorization: Bearer ${TOKEN}" -k https://10.100.0.1/api/v1/nodes?labelSelector=nodeType%3Dcontroller | jq -r .items[].metadata.name 1150 | 1151 | # 从SA(serviceaccount)处获取token的方法 1152 | NS=default 1153 | SA=admin 1154 | TOKEN=$(kubectl get secrets -n ${NS} $(kubectl get sa -n ${NS} ${SA} -o jsonpath='{.secrets[0].name}') -o jsonpath='{.data.token}' | base64 -d) 1155 | 1156 | # kubectl使用token 1157 | # 
XXX:需要说明的,如果有~/.kube/config文件,kubectl还是优先使用该kubeconfig文件 1158 | kubectl get pod --token ${TOKEN} -s https://api.foo.bar:6443 --insecure-skip-tls-verify 1159 | 1160 | # 模仿Pod内使用in-cluster配置访问apiserver 1161 | NS=default 1162 | POD=test 1163 | TOKEN=$(kubectl exec -n ${NS} ${POD} -- cat /var/run/secrets/kubernetes.io/serviceaccount/token) 1164 | curl -H "Authorization: Bearer ${TOKEN}" -k https://kubernetes.default.svc:443/api/v1/nodes 1165 | 1166 | 1167 | # 设置默认StorageClass 1168 | kubectl patch storageclass gold -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"true"}}}' 1169 | 1170 | ``` 1171 | 1172 | ## 资源遍历 1173 | 1174 | ### 遍历列出所有的资源类型及支持的操作 1175 | ```bash 1176 | # do core resources first, which are at a separate api location 1177 | api="core" 1178 | kubectl get --raw /api/v1 | jq -r --arg api "$api" '.resources | .[] | "\($api) \(.name): \(.verbs | join(" "))"' 1179 | 1180 | # now do non-core resources 1181 | APIS=$(kubectl get --raw /apis | jq -r '[.groups | .[].name] | join(" ")') 1182 | for api in $APIS; do 1183 | version=$(kubectl get --raw /apis/$api | jq -r '.preferredVersion.version') 1184 | kubectl get --raw /apis/$api/$version | jq -r --arg api "$api" '.resources | .[]? | "\($api) \(.name): \(.verbs | join(" "))"' 1185 | done 1186 | ``` 1187 | 1188 | ### 遍历所有pod 1189 | ```bash 1190 | for n_p in $(kubectl get pod -A | sed 1d | awk '{print $1":"$2}'); do 1191 | n=$(echo $n_p | cut -d: -f1) 1192 | p=$(echo $n_p | cut -d: -f2) 1193 | echo $n $p 1194 | kubectl get pod -n $n $p -o json | jq .spec.containers[].imagePullPolicy -r 2>/dev/null 1195 | kubectl get pod -n $n $p -o json | jq .spec.initContainers[].imagePullPolicy -r 2>/dev/null 1196 | echo 1197 | done 1198 | ``` 1199 | 1200 | ### 遍历所有pod及其容器 1201 | ```bash 1202 | pod_temp_file=$(mktemp pod_temp.XXXXX) 1203 | kubectl get namespace -o json | jq -r '.items[].metadata.name' | while read -r ns; do 1204 | kubectl get pod -n $ns -o json | jq -r '.items[].metadata.name' | while read -r pod; do 1205 | kubectl get pod -n $ns $pod -o json > $pod_temp_file 1206 | jq -r '.spec | select(.initContainers != null) |.initContainers[].name' $pod_temp_file | while read -r ic; do 1207 | echo $ns $pod $ic 1208 | done 1209 | jq -r '.spec | select(.containers != null) | .containers[].name' $pod_temp_file | while read -r c; do 1210 | echo $ns $pod $c 1211 | done 1212 | done 1213 | done 1214 | rm -f $pod_temp_file 1215 | ``` 1216 | 1217 | ### 遍历所有工作负载 1218 | ```bash 1219 | WorkLoads="ds deploy rc sts" 1220 | for wl in $(echo $WorkLoads); do 1221 | echo "============== $wl ==============" 1222 | for n_i in $(kubectl get $wl -A | sed 1d | awk '{print $1":"$2}'); do 1223 | n=$(echo $n_i | cut -d: -f1) 1224 | i=$(echo $n_i | cut -d: -f2) 1225 | echo $n $i : $(kubectl get $wl -n $n $i -o json | jq .spec.template.spec.containers[].imagePullPolicy -r 2>/dev/null) $(kubectl get $wl -n $n $i -o json | jq .spec.template.spec.initContainers[].imagePullPolicy -r 2>/dev/null) 1226 | done 1227 | done 1228 | ``` 1229 | 1230 | ### 遍历一个命名空间下所有资源 1231 | ```bash 1232 | NAMESPACE=default 1233 | kubectl api-resources --verbs=list --namespaced -o name \ 1234 | | xargs -n 1 kubectl get --show-kind --ignore-not-found -n ${NAMESPACE} 1235 | 1236 | # 统计 1237 | NAMESPACE=default 1238 | for t in $(kubectl api-resources --verbs=list --namespaced -o name); do echo "$t: $(kubectl get $t --ignore-not-found -n ${NAMESPACE} | wc -l)"; done 1239 | ``` 1240 | 1241 | ### 遍历一个命名空间下所有资源的label和annotations 1242 | ```bash 1243 | NAMESPACE=default 1244 | for api 
in $(kubectl api-resources --verbs=list --namespaced -o name); do 1245 | kubectl get ${api} --ignore-not-found -n ${NAMESPACE} -o json | jq .items[].metadata.labels 1246 | done 1247 | for api in $(kubectl api-resources --verbs=list --namespaced -o name); do 1248 | kubectl get ${api} --ignore-not-found -n ${NAMESPACE} -o json | jq .items[].metadata.annotations 1249 | done 1250 | ``` 1251 | 1252 | ### 遍历所有区分命名空间的资源的内容 1253 | ```bash 1254 | for k in $(kubectl api-resources --verbs=list --namespaced -o name); do 1255 | for ns in $(kubectl get ns -o custom-columns=NAME:.metadata.name --no-headers); do 1256 | for n in $(kubectl get --ignore-not-found -o custom-columns=NAME:.metadata.name --no-headers $k -n $ns 2>/dev/null); do 1257 | output=$(kubectl get $k -n $ns $n -o yaml 2>/dev/null | grep hehe) 1258 | if [ "$output" != "" ]; then 1259 | echo $k $ns $n "$output" 1260 | fi 1261 | done 1262 | done 1263 | done 1264 | ``` 1265 | 1266 | ### 遍历所有跨命名空间的资源 1267 | ```bash 1268 | kubectl api-resources --verbs=list --namespaced=false -o name \ 1269 | | xargs -n 1 kubectl get --show-kind --ignore-not-found 1270 | ``` 1271 | 1272 | ### 遍历所有跨命名空间的资源的label和annotations 1273 | ```bash 1274 | for api in $(kubectl api-resources --verbs=list --namespaced=false -o name); do 1275 | kubectl get ${api} --ignore-not-found -o json | jq .items[].metadata.labels 1276 | done 1277 | for api in $(kubectl api-resources --verbs=list --namespaced=false -o name); do 1278 | kubectl get ${api} --ignore-not-found -o json | jq .items[].metadata.annotations 1279 | done 1280 | ``` 1281 | 1282 | ### 遍历所有跨命名空间的资源的内容 1283 | ```bash 1284 | for k in $(kubectl api-resources --verbs=list --namespaced=false -o name); do 1285 | for n in $(kubectl get --ignore-not-found -o custom-columns=NAME:.metadata.name --no-headers $k 2>/dev/null); do 1286 | output=$(kubectl get $k $n -o yaml 2>/dev/null | grep hehe) 1287 | if [ "$output" != "" ]; then 1288 | echo $k $n "$output" 1289 | fi 1290 | done 1291 | done 1292 | ``` 1293 | 1294 | ### 遍历所有pod的cpu request配置 1295 | ```bash 1296 | # 统计pod的cpu request 1297 | POD_TEMP_RESULT_FILE=$(mktemp) 1298 | 1299 | kubectl get pod -A -o json > $POD_TEMP_RESULT_FILE 1300 | 1301 | cat $POD_TEMP_RESULT_FILE | jq -r '.items[] | .metadata.namespace + " " + .metadata.name' | while read -r ns pod; do 1302 | pod_yaml=$(cat $POD_TEMP_RESULT_FILE | jq -r --arg ns "$ns" --arg pod "$pod" '.items[] | select(.metadata.namespace == $ns) | select(.metadata.name == $pod)') 1303 | 1304 | for c in $(echo $pod_yaml | jq -r '.spec.containers[].name'); do 1305 | c_yaml=$(echo $pod_yaml | jq -r --arg c "$c" '.spec.containers[] | select(.name == $c)') 1306 | cpu_req=$(echo $c_yaml | jq -r .resources.requests.cpu) 1307 | 1308 | printf "%-46s %-64s %-40s %s\n" $ns $pod $c $cpu_req 1309 | done 1310 | done 1311 | 1312 | 1313 | rm -f $POD_TEMP_RESULT_FILE 1314 | ``` 1315 | 1316 | ## 客户端访问集群时context配置 1317 | 1318 | ```bash 1319 | # 注意,ca.crt、client.crt和client.key需要来自目标集群,例如配置中的deploy-cluster 1320 | kubectl config set-cluster deploy-cluster --server=https://${KUBE_APISERVER_IP_OR_DOMAINNAME}:${KUBE_APISERVER_PORT} --certificate-authority=./ca.crt --embed-certs=true 1321 | 1322 | kubectl config set-credentials deploy-user --client-key=./client.key --client-certificate=./client.crt --embed-certs=true 1323 | # 或者 1324 | kubectl config set-credentials local-cluster-user --token=eyJhb 1325 | 1326 | kubectl config set-context deploy-context --cluster=deploy-cluster --user=deploy-user --namespace=default 1327 | 1328 | # 
切换到deploy-cluster集群,注意,后面的kubectl都是在deploy-cluster上操作 1329 | kubectl config use-context deploy-context 1330 | ``` 1331 | 1332 | 1333 | 1334 | ## ConfigMap使用 1335 | 1336 | 将配置/模板文件保存到configMap并提取出来 1337 | 1338 | ~~~ 1339 | kubectl create configmap hehe --from-file=mysql-node-rc-template.yaml 1340 | kubectl get cm hehe -o jsonpath='{.data.mysql-node-rc-template\.yaml}' 1341 | ~~~ 1342 | 1343 | 创建加更新ConfigMap 1344 | 1345 | ~~~ 1346 | kubectl create configmap -n default os-watchdog-config --from-file=i18n_zh.json --from-file=i18n_en.json -o yaml --dry-run | kubectl apply -f - 1347 | ~~~ 1348 | 1349 | ## 日志相关配置 1350 | 1351 | ```bash 1352 | --log-dir=/var/log/kubernetes --logtostderr=false --v=4 1353 | ``` 1354 | 1355 | ## 提升集群HA性能 1356 | kubelet设置 `--node-status-update-frequency` 参数,例如从默认值10s调整为5s,提升节点状态变化感知效率。 1357 | kube-controller-manager设置 `--node-monitor-grace-period` 参数,例如从默认值40s调整为16s,提升节点变化响应速度。 1358 | 1359 | 1360 | 1361 | ## 强制删除Pod 1362 | 1363 | ```bash 1364 | kubectl delete pods --grace-period=0 --force 1365 | ``` 1366 | 1367 | ## Pod中获取PodIP的方法 1368 | 通过 [Downward API](https://kubernetes.io/docs/concepts/workloads/pods/downward-api/) ,可在Pod中获取例如PodIP之类的信息。 1369 | 这些信息属于Pod/容器自己的信息,容器初始化和运行的时候,获取这些信息有助于灵活配置。 1370 | 1371 | 有两种方式将这些信息提供到Pod内: 1372 | * [以环境变量方式](https://kubernetes.io/docs/tasks/inject-data-application/environment-variable-expose-pod-information/) 1373 | * [以文件/volume方式](https://kubernetes.io/docs/tasks/inject-data-application/downward-api-volume-expose-pod-information/) ,特别适用于**标签**和**注解** 1374 | 1375 | 例如以环境变量方式: 1376 | ```bash 1377 | env: 1378 | - name: MYIP 1379 | valueFrom: 1380 | fieldRef: 1381 | fieldPath: status.podIP 1382 | - name: RESOLVER_IP_ADDR 1383 | valueFrom: 1384 | fieldRef: 1385 | fieldPath: status.hostIP 1386 | ``` 1387 | 1388 | 注意: 1389 | 1. 仅kubernetes v1.8+版本支持。 1390 | 2. 仅支持部分字段,详见链接 [Downward API](https://kubernetes.io/docs/concepts/workloads/pods/downward-api/) 1391 | 3. 
容器中使用环境变量,在*args*中若还未被容器内shell解析则应指定为`$(ENV_VAR_KEY)`,若在shell执行器后指定则为`${ENV_VAR_KEY}` 1392 | 1393 | ## emptyDir在宿主机上的路径 1394 | 1395 | ```bash 1396 | # emptyDir文件夹路径 1397 | /var/lib/kubelet/pods//volumes/kubernetes.io~empty-dir 1398 | 1399 | # 查找一个emptyDir文件夹中的文件,一种简便(但效率较低)的查找方法 1400 | find /var/lib/kubelet/pods/*/volumes/kubernetes.io~empty-dir -name "file-name" 1401 | ``` 1402 | 1403 | ### 节点上emptyDir用量统计 1404 | ```bash 1405 | for d in $(sudo find /var/lib/kubelet/pods -type d -name "*empty-dir*" 2>/dev/null); do 1406 | sudo du -sh $d 1407 | done 1408 | 1409 | # 排除掉空文件夹 1410 | for d in $(sudo find /var/lib/kubelet/pods -type d -name "*empty-dir*" 2>/dev/null); do 1411 | sudo du -sh $d 1412 | done | grep -v "^0\>" 1413 | ``` 1414 | 1415 | ### 远程到节点统计emptyDir用量 1416 | 一个复杂的,借助`sh -c`远程执行`find`、`sudo`、`du`命令的示例: 1417 | ```bash 1418 | ssh $node_ip 'sh -c "sudo find /var/lib/kubelet/pods/ -maxdepth 3 -name "kubernetes.io~empty-dir" -type d -exec du -s {} \;"' | grep -vw ^0 | awk '{print $2}' 1419 | ``` 1420 | 1421 | ## FC存储多路径的PV配置 1422 | 1423 | ```bash 1424 | apiVersion: v1 1425 | kind: PersistentVolume 1426 | metadata: 1427 | name: hehe-pv 1428 | spec: 1429 | capacity: 1430 | storage: 10Gi 1431 | accessModes: 1432 | - ReadWriteOnce 1433 | volumeMode: Block 1434 | persistentVolumeReclaimPolicy: Retain 1435 | fc: 1436 | targetWWNs: ["21120002ac012e3b", "20110002ac012e3b"] 1437 | lun: 9 1438 | fsType: ext4 1439 | readOnly: false 1440 | ``` 1441 | WWN和lun在 /dev/disk/by-path 中获取,格式为 `/dev/disk/by-path/pci--fc-0x-lun-`,例如 1442 | ```bash 1443 | [root@devops1 by-path]# pwd 1444 | /dev/disk/by-path 1445 | [root@devops1 by-path]# ls | grep fc 1446 | ... 1447 | pci-0000:18:00.0-fc-0x21120002ac012e3b-lun-9 1448 | ... 1449 | pci-0000:18:00.1-fc-0x20110002ac012e3b-lun-9 1450 | ``` 1451 | 由于存储多路径,同一个LUN对应填写两个WWN,上述LUN-9对应 WWN 21120002ac012e3b 和 WWN 20110002ac012e3b 。 1452 | 1453 | 1454 | ## 编译kubelet 1455 | 使用构建镜像编译: 1456 | ```bash 1457 | docker run -it --privileged \ 1458 | -v ${PWD}/kubernetes:/go/src/github.com/kubernetes/kubernetes \ 1459 | -e GOPATH=/go \ 1460 | -w /go/src/github.com/kubernetes/kubernetes k8s.gcr.io/build-image/kube-cross:v1.15.8-legacy-1 sh 1461 | # 需要编什么架构,就export什么架构: 1462 | # export KUBE_BUILD_PLATFORMS=linux/arm64 1463 | export KUBE_BUILD_PLATFORMS=linux/amd64 1464 | make WHAT=cmd/kubelet GOLDFLAGS="" 1465 | ``` 1466 | 1467 | ## 获取k8s控制面组件指标 1468 | 1469 | **kube-apiserver**: 1470 | ```bash 1471 | # kube-apiserver 1472 | kubectl get --raw /metrics 1473 | ``` 1474 | 1475 | **kubelet**: 1476 | ```bash 1477 | # 从kubeconfig里拿ca.crt user.crt user.key 1478 | curl --cacert ./ca.crt --cert ./user.crt --key ./user.key https://x.x.x.x:10257/metrics -k 1479 | ``` 1480 | 1481 | ## kubeadm部署的集群的操作 1482 | ```bash 1483 | # 从kubelet的metrics里,查看编译时用的golang版本: 1484 | curl -sk https://127.0.0.1:10250/metrics --cacert /etc/kubernetes/pki/ca.crt --cert /etc/kubernetes/pki/apiserver-kubelet-client.crt --key /etc/kubernetes/pki/apiserver-kubelet-client.key | grep go_info 1485 | ``` 1486 | 1487 | ## kube-apiserver内部本地访问客户端 1488 | 检查证书有效期: 1489 | ```bash 1490 | MASTER_IP=1.2.3.4 1491 | KUBE_APISERVER_PORT=6443 1492 | curl --resolve apiserver-loopback-client:${KUBE_APISERVER_PORT}:${MASTER_IP} -k -v https://apiserver-loopback-client:${KUBE_APISERVER_PORT}/healthz 1493 | ``` 1494 | 1495 | 详见文章[kubernetes 究竟有没有 LTS?](https://mp.weixin.qq.com/s/3dATYVtgcQDxEOKR5XNofg) 1496 | 1497 | ## 读取 kubelet_internal_checkpoint 1498 | ```bash 1499 | jq --arg PodUID "xxx" '.Data.PodDeviceEntries[] | select(.PodUID == 
$PodUID) | select(.ContainerName == "hehe") | select(.ResourceName == "foo.bar/gpu")' /var/lib/kubelet/device-plugins/kubelet_internal_checkpoint | jq '.DeviceIDs["111"]'[] 1500 | ``` 1501 | 1502 | # 最佳实践 1503 | ## 使用finalizers拦截资源删除 1504 | 1505 | ### 手动清理finalizers 1506 | ```bash 1507 | kubectl patch pod xxx --type='json' -p='[{"op": "remove", "path": "/metadata/finalizers"}]' 1508 | ``` 1509 | 1510 | ## 资源限制 1511 | ### 容器进程数限制pids 1512 | TODO: 1513 | * https://kubernetes.io/docs/concepts/policy/pid-limiting/ 1514 | * https://access.redhat.com/articles/7033551 1515 | 1516 | 当kubelet的`podPidsLimit`设置为4096时: 1517 | ```bash 1518 | cd /sys/fs/cgroup/pids/kubepods.slice 1519 | # 查看一个pod的pids设置 1520 | for p in $(find . -name "pids.max"); do echo "$(cat $p) $p"; done | grep kubepods-burstable-podxxx.slice 1521 | 203348 ./kubepods-burstable.slice/kubepods-burstable-podxxx.slice/crio-111.scope/pids.max 1522 | 203348 ./kubepods-burstable.slice/kubepods-burstable-podxxx.slice/crio-conmon-222.scope/pids.max 1523 | 203348 ./kubepods-burstable.slice/kubepods-burstable-podxxx.slice/crio-222.scope/pids.max 1524 | max ./kubepods-burstable.slice/kubepods-burstable-podxxx.slice/crio-/pids.max 1525 | 4096 ./kubepods-burstable.slice/kubepods-burstable-podxxx.slice/pids.max 1526 | 203348 ./kubepods-burstable.slice/kubepods-burstable-podxxx.slice/crio-conmon-111.scope/pids.max 1527 | ``` 1528 | 可看到: 1529 | 1. sandbox pod的pids未设限,pids.max置为max 1530 | 2. 业务容器默认置为`203348` 1531 | 3. 整个pod的pids.max被置为4096 1532 | 1533 | 代码实现详见:*pkg/kubelet/cm/pod_container_manager_linux.go* 1534 | 1535 | ## HPA 1536 | 参考链接[kubernetes-hpa-configuration-guide](https://segment.com/blog/kubernetes-hpa-configuration-guide/) 1537 | 1538 | ## 集群内通过svc访问外部服务 1539 | ```bash 1540 | cat << EEOOFF | kubectl apply -f - 1541 | apiVersion: v1 1542 | kind: Namespace 1543 | metadata: 1544 | name: etcd 1545 | --- 1546 | apiVersion: v1 1547 | kind: Endpoints 1548 | metadata: 1549 | name: etcd 1550 | namespace: etcd 1551 | subsets: 1552 | - addresses: 1553 | - ip: 1.2.3.4 1554 | - ip: 1.2.3.5 1555 | - ip: 1.2.3.6 1556 | ports: 1557 | - port: 2379 1558 | --- 1559 | apiVersion: v1 1560 | kind: Service 1561 | metadata: 1562 | labels: 1563 | k8s-app: etcd 1564 | name: etcd 1565 | namespace: etcd 1566 | spec: 1567 | ports: 1568 | - name: etcd 1569 | port: 2379 1570 | protocol: TCP 1571 | targetPort: 2379 1572 | EEOOFF 1573 | ``` 1574 | 1575 | # 性能调优 1576 | ## 读懂监控指标 1577 | ### etcd监控指标 1578 | 告警经验值: 1579 | 1580 | | 指标 | label | 说明 | 告警值 | 1581 | |------------------------------|--------------------------|-----|---------| 1582 | | grpc_server_handling_seconds | grpc_method="Txn" | | P99 0.5 | 1583 | | grpc_server_handling_seconds | grpc_method="Range" | | P99 0.5 | 1584 | | grpc_server_handling_seconds | grpc_method="LeaseGrant" | | P99 0.5 | 1585 | | grpc_server_handling_seconds | grpc_method="MemberList" | | P99 0.5 | 1586 | 1587 | ### kube-apiserver监控指标 1588 | 1589 | | 指标 | label | 说明 | 告警值 | 1590 | |-------------------------------|-----------------|--------------------------------------------------------------------|-----| 1591 | | etcd_db_total_size_in_bytes | | Total size of the etcd database file physically allocated in bytes | | 1592 | | etcd_bookmark_counts | resource | Number of etcd bookmarks (progress notify events) split by kind | | 1593 | | etcd_lease_object_counts | | Number of objects attached to a single etcd lease | | 1594 | | etcd_request_duration_seconds | operation, type | Etcd request latency in seconds for each operation and object type 
| | 1595 | | apiserver_storage_objects | resource | Number of stored objects at the time of last check split by kind | | 1596 | 1597 | 1598 | ### kube-controller-manager监控指标 1599 | 1600 | ### kube-scheduler监控指标 1601 | 1602 | ### kubelet监控指标 1603 | 1604 | ## 内存优化 1605 | [k8s client-go内存优化](https://blog.ayanamist.com/2022/10/28/k8s-informer-mem-optimize.html): 1606 | * 优先使用Protobuf而不是JSON 1607 | * 流式list,避免informer首次list时置`resourceVersion=0`,全量拉取数据并一起做反序列化,相关[KEP-3157: allow informers for getting a stream of data instead of chunking](https://github.com/kubernetes/enhancements/tree/master/keps/sig-api-machinery/3157-watch-list) 1608 | 1609 | ## 查看defaultCpuSet核上CPU使用量 1610 | ```bash 1611 | function default_cores { 1612 | input=$(cat /var/lib/kubelet/cpu_manager_state | jq -r .defaultCpuSet) 1613 | IFS=',' read -ra ADDR <<< "$input" 1614 | for item in "${ADDR[@]}"; do 1615 | if [[ "$item" == *"-"* ]]; then 1616 | start=$(echo "$item" | cut -d'-' -f1) 1617 | end=$(echo "$item" | cut -d'-' -f2) 1618 | for (( i=$start; i<=$end; i++ )); do 1619 | echo $i 1620 | done 1621 | else 1622 | echo $item 1623 | fi 1624 | done 1625 | } 1626 | 1627 | function cores_util { 1628 | temp_result=$(mktemp) 1629 | ps -eLo pid,tid,comm,pcpu,psr > $temp_result 1630 | for c in $(default_cores); do 1631 | util=$(cat $temp_result | grep " $c$" | awk '{s+=$4}END{print s}') 1632 | printf "Core %2d ============================== total usage %s%%\n" $c $util 1633 | cat $temp_result | grep " $c$" | sort -rnk4 | head -n 3 1634 | done 1635 | 1636 | rm -f $temp_result 1637 | } 1638 | ``` 1639 | 1640 | # Deep Dive系列 1641 | ## kube-apiserver 1642 | 1643 | ### 服务启动流程 1644 | 起点`kubernetes/cmd/kube-apiserver/app/server.go`中`CreateServerChain()` 。 1645 | 1646 | 依次经过*Aggregator*、 *KubeAPIServer*、 *APIExtensionServer*三个组件处理请求。 1647 | 1648 | ### 服务端fieldSelector 1649 | XXX TODO 1650 | 1651 | ### REST Storage 1652 | `kubernetes/pkg/registry/core/rest/storage_core.go`中`NewLegacyRESTStorage` 。 1653 | 1654 | ### 安装API及其REST Storage 1655 | `vendor/k8s.io/apiserver/pkg/server/genericapiserver.go`中`InstallAPIGroups`和`InstallLegacyAPIGroup` 1656 | 1657 | 通过*go-restful*实现API服务,*go-restful*的*Container*在`vendor/k8s.io/apiserver/pkg/server/handler.go`中`NewAPIServerHandler`初始化。 1658 | 1659 | `APIGroupVersion`的`Storage`中,有该*GroupVersion*下所有*resources*的`rest.Storage`。 1660 | 1661 | ### API定义和版本 1662 | ```golang 1663 | // Pod is a collection of containers, used as either input (create, update) or as output (list, get). 1664 | type Pod struct { 1665 | metav1.TypeMeta 1666 | // +optional 1667 | metav1.ObjectMeta 1668 | 1669 | // Spec defines the behavior of a pod. 1670 | // +optional 1671 | Spec PodSpec 1672 | 1673 | // Status represents the current information about a pod. This data may not be up 1674 | // to date. 
1675 | // +optional 1676 | Status PodStatus 1677 | } 1678 | ``` 1679 | 其中: 1680 | - **继承**了`metav1.TypeMeta`和`metav1.ObjectMeta`,即直接拥有通用属性和方法,实现`runtime.Object`等接口 1681 | - **组合**了`PodSpec`和`PodStatus`,指定该资源特性属性 1682 | 1683 | API版本: 1684 | - **外部**版本:`staging/src/k8s.io/api/core/v1/types.go` 1685 | - **内部**版本:`pkg/apis/core/types.go` 1686 | 1687 | `k8s.io/apimachinery/pkg/runtime/serializer/versioning/versioning.go`中`codec`实现: 1688 | - 内外部版本的转化 1689 | - 序列化、反序列化 1690 | 1691 | ### 序列化和反序列化 1692 | _json_、_protobuf_、*yaml*格式的序列化和反序列化实现在`staging/src/k8s.io/apimachinery/pkg/runtime/serializer`中。 1693 | 1694 | #### TypeMeta的反序列化 1695 | 以*json*为例,在`staging/src/k8s.io/apimachinery/pkg/runtime/serializer/json/meta.go`的`SimpleMetaFactory.Interpret()`中, 1696 | 借助`go/src/encoding/json/decode.go`实现对`metav1.TypeMeta`的反序列化,获取*GVK* 。 1697 | 1698 | #### 外部版本的序列化和反序列化 1699 | `staging/src/k8s.io/apimachinery/pkg/runtime/serializer/json/json.go`中`Serializer.Decode()` ,实现外部版本的序列化和反序列化操作。 1700 | 1701 | #### codec和codec factory 1702 | [TODO](https://cloud.tencent.com/developer/article/1891182) 1703 | 1704 | *codec*将内部版本转换为外部版本,并序列化。 1705 | 1706 | `staging/src/k8s.io/apimachinery/pkg/runtime/serializer/versioning/versioning.go`: 1707 | ```golang 1708 | type codec struct { 1709 | encoder runtime.Encoder 1710 | decoder runtime.Decoder 1711 | convertor runtime.ObjectConvertor 1712 | creater runtime.ObjectCreater 1713 | typer runtime.ObjectTyper 1714 | defaulter runtime.ObjectDefaulter 1715 | 1716 | encodeVersion runtime.GroupVersioner 1717 | decodeVersion runtime.GroupVersioner 1718 | 1719 | identifier runtime.Identifier 1720 | 1721 | // originalSchemeName is optional, but when filled in it holds the name of the scheme from which this codec originates 1722 | originalSchemeName string 1723 | } 1724 | ``` 1725 | 1726 | `CodecFactory`环境方法: 1727 | * `DecoderToVersion`,返回反序列化并转化为内部版本的`Decoder`。 1728 | * `EncoderForVersion`,返回转换为特定外部版本并序列化的`Encoder`,编码过程中首先将对象(一般为内部版本)转化为目标版本,再序列化到响应数据流中。 1729 | 1730 | ### 资源schema 1731 | 参见[链接](https://cloud.tencent.com/developer/article/1902710) 。 1732 | 1733 | GVK和资源model的对应关系,资源model的默认值,资源在不同版本间转化的函数等,均由资源schema维护。 1734 | 1735 | ### 健康检查/healthz 1736 | 检查三个方面: 1737 | 1. 初始配置时,增加默认检查方法,包括`k8s.io/apiserver/pkg/server/healthz`中`PingHealthz`和`LogHealthz` 1738 | 2. 检查存储后端(etcd)是否健康,使用`k8s.io/apiserver/pkg/storage/storagebackend/factory`中`CreateHealthCheck()`创建检查方法 1739 | 3. 
若通过`--encryption-provider-config`配置KMS加密,使用`k8s.io/apiserver/pkg/server/options/encryptionconfig`中`GetKMSPluginHealthzCheckers()`创建检查方法 1740 | 1741 | ``` 1742 | [+]ping ok 1743 | [+]log ok 1744 | [-]etcd failed: reason withheld 1745 | [+]poststarthook/start-kube-apiserver-admission-initializer ok 1746 | [+]poststarthook/generic-apiserver-start-informers ok 1747 | [+]poststarthook/start-apiextensions-informers ok 1748 | [+]poststarthook/start-apiextensions-controllers ok 1749 | [+]poststarthook/crd-informer-synced ok 1750 | [+]poststarthook/bootstrap-controller ok 1751 | [+]poststarthook/rbac/bootstrap-roles ok 1752 | [+]poststarthook/scheduling/bootstrap-system-priority-classes ok 1753 | [+]poststarthook/apiserver/bootstrap-system-flowcontrol-configuration ok 1754 | [+]poststarthook/start-cluster-authentication-info-controller ok 1755 | [+]poststarthook/start-kube-aggregator-informers ok 1756 | [+]poststarthook/apiservice-registration-controller ok 1757 | [+]poststarthook/apiservice-status-available-controller ok 1758 | [+]poststarthook/kube-apiserver-autoregistration ok 1759 | [+]autoregister-completion ok 1760 | [+]poststarthook/apiservice-openapi-controller ok 1761 | healthz check failed 1762 | ``` 1763 | 1764 | ### 就绪检查/readyz 1765 | 1. `kube-apiserver`的`shutdown-delay-duration`参数控制优雅退出。 1766 | 2. 在`kube-apiserver`退出期间,就绪检查失败、但健康检查ok,确保*in flight*的请求能正常处理,但不要有新的建连和请求上来 1767 | 3. `kube-apiserver`的`shutdown-send-retry-after`控制在优雅退出期间,有新请求到来时,返回`retry` 1768 | 4. 实现逻辑在`k8s.io/apiserver/pkg/server/healthz`,详见`func (s *GenericAPIServer) AddReadyzChecks(checks ...healthz.HealthChecker) error` 1769 | 1770 | ### node authorizer实现 1771 | `plugin/pkg/auth/authorizer/node/graph.go`中为同node相关的资源创建的graph: 1772 | ``` 1773 | volume attachment -> node 1774 | pod -> node 1775 | sa -> pod // pod service account 1776 | secret -> pod // every secret referenced by the pod, e.g. ImagePullSecrets, Container Env from secret, Volumes' secret ref 1777 | cm -> pod // every cm referenced by the pod, e.g. 
Container Env from cm, cm volumes 1778 | pvc -> pod // every pvc referenced by the pod in volumes 1779 | pv -> pvc 1780 | secret -> pv // every secret referenced by the PV spec 1781 | ``` 1782 | 在 `authorization-mode` 的 `node` 中,根据上述资源与节点的关系图`graph`判断节点是否有访问权限。 1783 | 1784 | ## kube-controller-manager 1785 | 1786 | ### 配置和初始化 1787 | 1788 | ### leader选举 1789 | 1790 | ### 核心Controller 1791 | 1792 | ## kube-scheduler 1793 | 1794 | ### 配置和初始化 1795 | 1796 | ### leader选举 1797 | 1798 | ### 资源调度 1799 | 1800 | ## kubelet 1801 | 1802 | ### 配置和初始化 1803 | `kubeletConfiguration v1beta1`的默认配置在*pkg/kubelet/apis/config/v1beta1/defaults.go* 中 *SetDefaults_KubeletConfiguration()* 设置。 1804 | 1805 | ### PLEG 1806 | 1807 | ### 调用CRI接口 1808 | 容器拉起流程 `kubelet` --cri--> `cri-o` --oci--> `runc`。 1809 | 1810 | * **_cri_**接口`k8s.io/cri-api/pkg/apis/runtime/v1/api.pb.go`,例如由`LinuxContainerResources`定义容器的资源配置。 1811 | * **_oci_**接口`github.com/opencontainers/runtime-spec/specs-go/config.go`,例如由`LinuxResources`定义容器的资源配置。 1812 | 1813 | 1814 | ### (间接)通过CNI接口管理网络 1815 | 1816 | ### 通过CSI管理存储 1817 | 1818 | ### 设备和资源管理 1819 | 1820 | #### 资源计算和预留 1821 | 1822 | ##### 为容器进程设置oom_score_adj 1823 | 针对不同服务质量和优先级的pod,在创建容器(拉起进程时)kubelet会设置不同的*oom_score_adj*,具体的: 1824 | * *Guaranteed* 为 -997 1825 | * *BestEffort* 为 1000 1826 | * *Burstable* 根据公式 `min(max(2, 1000 - (1000 * memoryRequestBytes) / machineMemoryCapacityBytes), 999)` 计算得出 1827 | * *system-node-critical* 优先级的Pod,也设置为 -997 1828 | 1829 | 进一步阅读: 1830 | * [Node out of memory behavior](https://kubernetes.io/docs/concepts/scheduling-eviction/node-pressure-eviction/#node-out-of-memory-behavior) 1831 | * [代码GetContainerOOMScoreAdjust()](https://github.com/kubernetes/kubernetes/blob/fa88c0b7796170eeff5686ae1d7d0f2f3f0df5de/pkg/kubelet/qos/policy.go#L43) 1832 | 1833 | 1834 | #### Topology Manager 1835 | 1836 | #### CPU Manager 1837 | 1838 | ##### 遍历所有Pod的cpuset配置 1839 | ```bash 1840 | # 遍历打印所有容器的cpuset配置,注意需要特权用户执行 1841 | printf "%-44s %-64s %-48s %s\n" NAMESPACE POD CONTAINER CPUSET 1842 | for cid in $(crictl ps -q); do 1843 | scope=$(find /sys/fs/cgroup/cpuset/ -name "crio-${cid}.scope") 1844 | if [ "${scope}" != "" ]; then 1845 | cpuset=$(cat ${scope}/cpuset.cpus) 1846 | cinfo=$(crictl inspect $cid | jq -r '.status | .labels["io.kubernetes.pod.namespace"] + " " + .labels["io.kubernetes.pod.name"] + " " + .labels["io.kubernetes.container.name"]') 1847 | printf "%-44s %-64s %-48s %s\n" $cinfo $cpuset 1848 | else 1849 | echo "Error: missing scope for $cid" >> /dev/stderr 1850 | fi 1851 | done 1852 | ``` 1853 | 1854 | #### Memory Manager 1855 | 1856 | #### Device Manager 1857 | Device Manager调用Device Plugin,完成扩展设备的发现、分配。 1858 | 1859 | ### 节点优雅关机 GracefulNodeShutdown 1860 | `GracefulNodeShutdown` 1861 | 1862 | ## 库函数和实操 1863 | ### 特性门featuregate 1864 | featuregate在`pkg/features/kube_features.go`中定义。 1865 | 1866 | ### 处理runtime.Object 1867 | #### 获取meta.Object信息 1868 | 方法1,将 *runtime.Object* 转成 *unstructured.Unstructured* : 1869 | ```golang 1870 | import ( 1871 | "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" 1872 | "k8s.io/apimachinery/pkg/runtime" 1873 | ) 1874 | 1875 | func xxx() { 1876 | var obj runtime.Object 1877 | ... 1878 | innerObj, err := runtime.DefaultUnstructuredConverter.ToUnstructured(obj) 1879 | if err == nil { 1880 | u := &unstructured.Unstructured{Object: innerObj} 1881 | klog.Infof("%s %s", u.GroupVersionKind(), klog.KObj(u)) 1882 | } else { 1883 | klog.Infof("%v", obj) 1884 | } 1885 | ... 
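	// 注:DefaultUnstructuredConverter基于反射把typed对象整体转换为map,便于统一读取GVK和元数据;
	// 若只需访问元数据,可优先使用下文方法2的meta.Accessor,避免整体转换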
1886 | } 1887 | ``` 1888 | 1889 | 方法2,使用*k8s.io/apimachinery/pkg/api/meta*的*NewAccessor()* : 1890 | ```golang 1891 | import ( 1892 | "k8s.io/apimachinery/pkg/api/meta" 1893 | "k8s.io/apimachinery/pkg/runtime" 1894 | ) 1895 | 1896 | func xxx() { 1897 | var obj runtime.Object 1898 | ... 1899 | accessor := meta.NewAccessor() 1900 | kind, _ := accessor.Kind(obj) 1901 | ... 1902 | } 1903 | 1904 | // 或者,直接获取meta.Object 1905 | func yyy() { 1906 | var obj runtime.Object 1907 | ... 1908 | meta, err := meta.Accessor(obj) 1909 | if err != nil { 1910 | return "", fmt.Errorf("object has no meta: %v", err) 1911 | } 1912 | if len(meta.GetNamespace()) > 0 { 1913 | return meta.GetNamespace() + "/" + meta.GetName(), nil 1914 | } 1915 | return meta.GetName(), nil 1916 | } 1917 | ``` 1918 | 1919 | # Debug 1920 | ```bash 1921 | # 开启apiserver proxy 1922 | # 注意,因示例和debug原因开启的disable-filter选项,会带来严重的安全问题,需谨慎 1923 | # 默认端口8001 1924 | kubectl proxy --address=0.0.0.0 --disable-filter=true 1925 | 1926 | # kube-apiserver 1927 | # 浏览器打开 http://x.x.x.x:8001/debug/pprof/ 查看apiserver的pprof信息 1928 | # 获取apiserver的goroutine信息(概要) 1929 | curl http://x.x.x.x:8001/debug/pprof/goroutine?debug=1 1930 | # 或(详细信息) 1931 | curl http://x.x.x.x:8001/debug/pprof/goroutine?debug=2 1932 | # TODOTODO 1933 | 1934 | # kubelet 1935 | # 获取kubelet指标 1936 | curl http://127.0.0.1:8001/api/v1/nodes/node-x/proxy/metrics 1937 | # 保持kubelet在线运行,使用pprof分析kubelet,拿到goroutine堆栈 1938 | curl http://127.0.0.1:8001/api/v1/nodes/node-x/proxy/debug/pprof/goroutine?debug=2 1939 | # 停止kubelet进程,并打印堆栈,特别有助于定位hang住的问题 1940 | kill -s SIGQUIT 1941 | # 或者 1942 | kill -SIGABRT 1943 | # 收集heap信息 1944 | wget -O kubelet-heap.out http://127.0.0.1:8001/api/v1/nodes/node-x/proxy/debug/pprof/heap 1945 | # 收集profile信息 1946 | wget -O kubelet-profile.out http://127.0.0.1:8001/api/v1/nodes/node-x/proxy/debug/pprof/profile 1947 | 1948 | # kubelet健康检查 1949 | curl 127.0.0.1:10248/healthz 1950 | # 获取更多细节 1951 | curl -k https://127.0.0.1:10250/healthz --cacert /etc/kubernetes/keys/ca.pem --cert /etc/kubernetes/keys/kubernetes.pem --key /etc/kubernetes/keys/kubernetes-key.pem 1952 | # 或者 1953 | curl -k https://127.0.0.1:10250/healthz --cacert /etc/kubernetes/pki/ca.crt --cert /etc/kubernetes/pki/apiserver-kubelet-client.crt --key /etc/kubernetes/pki/apiserver-kubelet-client.key 1954 | 1955 | # kubelet的metrics,其中ca.crt、tls.crt和tls.key从kubeconfig中提取 1956 | curl -k https://127.0.0.1:10250/metrics --cacert ca.crt --cert tls.crt --key tls.key 1957 | 1958 | # kubelet “看到”的节点内存实际用量 1959 | NODE_IP=10.0.0.123 1960 | sudo curl -sk https://${NODE_IP}:10250/metrics/resource --cacert /etc/kubernetes/pki/ca.crt --cert /etc/kubernetes/pki/apiserver-kubelet-client.crt --key /etc/kubernetes/pki/apiserver-kubelet-client.key | grep node_memory_working_set_bytes 1961 | ``` 1962 | 1963 | | 路径 | 说明 | 1964 | |-------------------|-------------------| 1965 | | /metrics | kubelet自己的指标 | 1966 | | /metrics/cadvisor | 容器监控指标 | 1967 | | /metrics/probes | Pod的Prober指标 | 1968 | | /metrics/resource | 节点和Pod的CPU和内存资源开销 | 1969 | 1970 | 1971 | ## kube-apiserver 1972 | ```bash 1973 | # 动态调整kube-apiserver日志级别 1974 | curl -X PUT http://127.0.0.1:8001/debug/flags/v -d "4" 1975 | 1976 | # 开启proxy 1977 | kubectl proxy --address=0.0.0.0 --disable-filter=true 1978 | # 收集heap 1979 | wget -O $(hostname)-heap-$(date +"%y%m%d%H%M") http://127.0.0.1:8001/debug/pprof/heap 1980 | # 收集goroutine 1981 | curl http://127.0.0.1:8001/debug/pprof/goroutine?debug=2 >> $(hostname)-goroutine-debug2-$(date +"%y%m%d%H%M") 1982 | # 收集profile 1983 
| wget -O $(hostname)-profile-$(date +"%y%m%d%H%M") http://127.0.0.1:8001/debug/pprof/profile 1984 | 1985 | # 分析pprof 1986 | go tool pprof -http :8080 *-{heap,goroutine-debug2,profile}-* 1987 | ``` 1988 | 1989 | ## kubelet 1990 | ```bash 1991 | # 动态调整kubelet日志级别,不用重启服务 1992 | NODENAME=hehe 1993 | kubectl proxy & 1994 | sleep 1s # 等待proxy端口开始监听 1995 | curl -X PUT http://127.0.0.1:8001/api/v1/nodes/${NODENAME}/proxy/debug/flags/v -d "5" 1996 | 1997 | # 收集kubelet堆栈,在/tmp目录查看堆栈文件,该操作不会导致kubelet进程重启 1998 | kill -s SIGUSR2 `pidof kubelet` 1999 | 2000 | # 使用kubectl收集 2001 | NODENAME=hehe 2002 | kubectl get --raw /api/v1/nodes/${NODENAME}/proxy/debug/pprof/heap > kubelet-heap-$NODENAME-$(date +"%Y%m%d_%H%M%S").out 2003 | kubectl get --raw /api/v1/nodes/${NODENAME}/proxy/debug/pprof/profile > kubelet-profile-$NODENAME-$(date +"%Y%m%d_%H%M%S").out 2004 | kubectl get --raw /api/v1/nodes/${NODENAME}/proxy/debug/pprof/goroutine > kubelet-goroutine-$NODENAME-$(date +"%Y%m%d_%H%M%S").out 2005 | # 查看pprof信息 2006 | go tool pprof -http :8080 xxx.out 2007 | 2008 | # 节点本地收集kubelet的profile文件 2009 | TODO 2010 | 2011 | # 查看kubelet的metrics 2012 | NODENAME=hehe 2013 | kubectl get --raw /api/v1/nodes/$NODENAME/proxy/metrics | grep go_gc_pauses_seconds_bucket 2014 | ``` 2015 | 2016 | ## kube-controller-manager 2017 | 2018 | ## kube-scheduler 2019 | 在日志中,dump出kube-scheduler的内存数据(`Dump of cached NodeInfo`): 2020 | ```bash 2021 | kill -s SIGUSR2 $(pidof kube-scheduler) 2022 | ``` 2023 | 2024 | 主要包括等待调度的Pod队列详情(`Dump of scheduling queue`),以及各节点: 2025 | * 节点名 2026 | * 已请求资源`Requested Resources` 2027 | * 可分配资源`Allocatable Resources` 2028 | * 已调度的Pod详情 2029 | 2030 | # 备忘 2031 | ## k8s版本信息 2032 | - [API Removal](https://kubernetes.io/docs/reference/using-api/deprecation-guide/) 2033 | - [API废弃策略](https://kubernetes.io/docs/reference/using-api/deprecation-policy/) 2034 | 2035 | ## 从源码编译kubernetes时版本信息 2036 | `hack/print-workspace-status.sh` 2037 | 2038 | ## 修改结构体定义后更新api-rules校验 2039 | 在修改源码中结构体定义后,需要执行如下命令,更新排除api校验规则的文件`api/api-rules/violation_exceptions.list` : 2040 | ```bash 2041 | FORCE_HOST_GO=1 make generated_files UPDATE_API_KNOWN_VIOLATIONS=true 2042 | ``` 2043 | 其中`FORCE_HOST_GO=1`强制使用主机上的go,否则默认使用`.go-version`定义的版本。 2044 | 2045 | ## 构建时如何选取version 2046 | kubernetes在构建时,根据git获取信息生成version,主要实现在 *kubernetes/hack/lib/version.sh* 中,核心是使用`git describe`: 2047 | > KUBE_GIT_VERSION=$("${git[@]}" describe --tags --match='v*' --abbrev=14 "${KUBE_GIT_COMMIT}^{commit}" 2>/dev/null) 2048 | 2049 | 再将`KUBE_GIT_VERSION`转成`semantic version`格式。 2050 | 2051 | ## StatefulSet无法更新中volumeClaimTemplates的request 2052 | * [问题讨论](https://serverfault.com/questions/955293/how-to-increase-disk-size-in-a-stateful-set) 2053 | * [社区issue](https://github.com/kubernetes/kubernetes/issues/68737) 2054 | 2055 | ## 其它 2056 | `kube-controller-manager`的默认配置在`kubernetes/pkg/controller/apis/config/v1alpha1/zz_generated.defaults.go`中`SetDefaults_KubeControllerManagerConfiguration()`设置。 2057 | -------------------------------------------------------------------------------- /docs/openshift.md: -------------------------------------------------------------------------------- 1 | # TOC 2 | 3 | 4 | * [TOC](#toc) 5 | * [Deep Dive](#deep-dive) 6 | * [SCC](#scc) 7 | * [使用oc adm policy设置scc权限的注意事项](#使用oc-adm-policy设置scc权限的注意事项) 8 | * [代码方式为sa设置scc权限](#代码方式为sa设置scc权限) 9 | * [“我”是否有这个操作权限](#我是否有这个操作权限) 10 | * [“谁”有这个操作权限](#谁有这个操作权限) 11 | * [常用操作](#常用操作) 12 | * [日志收集must-gather](#日志收集must-gather) 13 | * [Operator关键行为](#operator关键行为) 14 | * [rollout新版本](#rollout新版本) 15 | 
* [配置审计日志策略](#配置审计日志策略) 16 | * [检查被废弃API的使用情况](#检查被废弃api的使用情况) 17 | * [获取用户认证Token](#获取用户认证token) 18 | * [使用whoami获取Token](#使用whoami获取token) 19 | * [获取OAuth用户的Token](#获取oauth用户的token) 20 | * [从ServiceAccount中获取Token](#从serviceaccount中获取token) 21 | * [更新](#更新) 22 | * [强制更新](#强制更新) 23 | * [修改节点kubelet配置](#修改节点kubelet配置) 24 | * [让cvo不要调谐资源](#让cvo不要调谐资源) 25 | * [让cvo删除资源](#让cvo删除资源) 26 | * [强制rollout组件](#强制rollout组件) 27 | * [测试](#测试) 28 | * [执行内容查询](#执行内容查询) 29 | * [UT](#ut) 30 | * [IT](#it) 31 | * [组件e2e](#组件e2e) 32 | * [版本e2e](#版本e2e) 33 | * [OpenShift3 and OKD](#openshift3-and-okd) 34 | * [常用操作](#常用操作-1) 35 | * [官方yum源](#官方yum源) 36 | * [OpenShift 3.x DNS介绍](#openshift-3x-dns介绍) 37 | * [深入OpenShift SDN网络](#深入openshift-sdn网络) 38 | 39 | 40 | # Deep Dive 41 | ## SCC 42 | SCC基于admission control准入检查机制实现。 43 | 代码详见 [openshift/apiserver-library-go](https://github.com/openshift/apiserver-library-go/pkg/securitycontextconstraints) 。 44 | 45 | OpenShift中预置的SCC: 46 | ``` 47 | # oc get scc 48 | NAME PRIV CAPS SELINUX RUNASUSER FSGROUP SUPGROUP PRIORITY READONLYROOTFS VOLUMES 49 | anyuid false MustRunAs RunAsAny RunAsAny RunAsAny 10 false ["configMap","downwardAPI","emptyDir","persistentVolumeClaim","projected","secret"] 50 | hostaccess false MustRunAs MustRunAsRange MustRunAs RunAsAny false ["configMap","downwardAPI","emptyDir","hostPath","persistentVolumeClaim","projected","secret"] 51 | nonroot false MustRunAs MustRunAsNonRoot RunAsAny RunAsAny false ["configMap","downwardAPI","emptyDir","persistentVolumeClaim","projected","secret"] 52 | ... 53 | privileged true ["*"] RunAsAny RunAsAny RunAsAny RunAsAny false ["*"] 54 | restricted false MustRunAs MustRunAsRange MustRunAs RunAsAny false ["configMap","downwardAPI","emptyDir","persistentVolumeClaim","projected","secret"] 55 | ``` 56 | 这些SCC由`kube-apiserver-operator`设置,详见实现`cluster-kube-apiserver-operator/bindata/bootkube/scc-manifests` 。 57 | 58 | 预置的SCC中,*anyuid*具有最高优先级(PRIORITY),因此当用户有 *use scc/anyuid* 权限时(`oc adm policy who-can use securitycontextconstraints/anyuid`),会优先匹配到*anyuid*。 59 | 60 | 除*anyuid*外,其余预置scc的优先级一样。 61 | 这时按照资源/权限限制由**强**到**弱**排序,依次为pod匹配scc,按照*restricted*到*privileged*的排序。 62 | 实现详见`openshift/apiserver-library-go/pkg/securitycontextconstraints/sccadmission/admission.go`中`computeSecurityContext()`。 63 | 64 | 注意,在为pod匹配scc时,需要创建pod的用户(例如deployment的pod最终由system:serviceaccount:kube-system:replicaset-controller创建)或者pod里的ServiceAccount具备*use*该scc资源的权限,否则跳过。 65 | 这表明了在为sa赋予scc权限时,实际上就是为sa创建ClusterRoleBinding,让其能够访问scc/xxx资源。 66 | 67 | 根据scc为pod和容器设置securityContext时,mcs标签、fsGroup、uid等信息从命名空间的注解取得,例如: 68 | ``` 69 | openshift.io/sa.scc.mcs: s0:c24,c9 70 | openshift.io/sa.scc.supplemental-groups: 1000570000/10000 71 | openshift.io/sa.scc.uid-range: 1000570000/10000 72 | ``` 73 | 74 | 上述注解由*cluster-policy-controller*设置,其以容器方式运行在OpenShift的*pod/kube-controller-manager*中。扩展的控制器包括: 75 | ``` 76 | var ControllerInitializers = map[string]InitFunc{ 77 | "openshift.io/namespace-security-allocation": RunNamespaceSecurityAllocationController, 78 | "openshift.io/resourcequota": RunResourceQuotaManager, 79 | "openshift.io/cluster-quota-reconciliation": RunClusterQuotaReconciliationController, 80 | "openshift.io/cluster-csr-approver": RunCSRApproverController, 81 | "openshift.io/podsecurity-admission-label-syncer": runPodSecurityAdmissionLabelSynchronizationController, 82 | } 83 | ``` 84 | 代码详见 [cluster-policy-controller](https://github.com/openshift/cluster-policy-controller) 。 85 | 86 | ### 使用oc adm policy设置scc权限的注意事项 87 | OpenShift的SCC实际上基于RBAC实现。 88 | 89 
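在较新的OpenShift 4.x版本中,SCC的*use*权限由名为`system:openshift:scc:<scc名>`的ClusterRole承载(即本节后文ClusterRoleBinding示例所引用的角色),可以直接查看其定义(假设集群已内置该角色):
```bash
# 查看privileged SCC对应的ClusterRole
oc get clusterrole system:openshift:scc:privileged -o yaml
```
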
| 需要注意如下两条命令的区别: 90 | 1. `oc adm policy add-scc-to-user privileged -n hehe -z default` 91 | 2. `oc adm policy add-scc-to-user privileged system:serviceaccount:hehe:default` 92 | 93 | 上述命令1,创建`RoleBinding`,将`Privileged`赋予了hehe命名空间下的sa/default后得到如下结果: 94 | ```bash 95 | # oc auth can-i --as=system:serviceaccount:hehe:default use scc/privileged 96 | no 97 | # oc auth can-i --as=system:serviceaccount:hehe:default use scc/privileged -n hehe 98 | yes 99 | ``` 100 | 101 | 上述命令2,创建`ClusterRoleBinding`,将`Privileged`赋予了hehe命名空间下的sa/default后得到如下结果: 102 | ```bash 103 | # oc auth can-i --as=system:serviceaccount:hehe:default use scc/privileged 104 | yes 105 | # oc auth can-i --as=system:serviceaccount:hehe:default use scc/privileged -n hehe 106 | yes 107 | ``` 108 | 109 | ### 代码方式为sa设置scc权限 110 | 根据scc实现原理,便可以通过创建`ClusterRoleBinding`为sa指定scc权限 111 | ``` 112 | apiVersion: rbac.authorization.k8s.io/v1 113 | kind: ClusterRoleBinding 114 | metadata: 115 | name: <自定义一个名字> 116 | roleRef: 117 | apiGroup: rbac.authorization.k8s.io 118 | kind: ClusterRole 119 | name: system:openshift:scc:privileged 120 | subjects: 121 | - kind: ServiceAccount 122 | name: node-problem-detector 123 | namespace: openshift-npd-system 124 | ``` 125 | 126 | ### “我”是否有这个操作权限 127 | ```bash 128 | oc auth can-i --as=system:serviceaccount:default:user use scc/privileged 129 | oc auth can-i --as=system:serviceaccount:default:user use scc/anyuid 130 | ``` 131 | 132 | ### “谁”有这个操作权限 133 | ```bash 134 | oc adm policy who-can use securitycontextconstraints/privileged 135 | oc adm policy who-can use securitycontextconstraints/anyuid 136 | ``` 137 | 138 | # 常用操作 139 | 140 | ## 日志收集must-gather 141 | 使用must-gather收集审计日志: 142 | ```bash 143 | oc adm must-gather -- /usr/bin/gather_audit_logs 144 | ``` 145 | 146 | 具体的,会拉起一个临时pod(使用must-gather镜像),这个pod里执行日志收集脚本。上面的例子中,就会执行`/usr/bin/gather_audit_logs`这个脚本。 147 | 脚本中大量使用oc/kubectl命令查询集群信息和收集日志,因此会创建临时clusterRoleBinding,为pod使用的sa临时指定cluster-admin权限。 148 | 149 | ## Operator关键行为 150 | ### rollout新版本 151 | ```bash 152 | oc logs -n openshift-kube-apiserver-operator kube-apiserver-operator-xxx | grep triggered 153 | ``` 154 | 155 | ## 配置审计日志策略 156 | [TODO](https://access.redhat.com/documentation/en-us/openshift_container_platform/4.10/html/security_and_compliance/audit-log-policy-config) 157 | 158 | ## 检查被废弃API的使用情况 159 | 参见[Navigating Kubernetes API deprecations and removals](https://access.redhat.com/articles/6955985) : 160 | ```bash 161 | # 过滤出会被废弃的API 162 | oc get apirequestcounts \ 163 | -o jsonpath='{range .items[?(@.status.removedInRelease!="")]}{.status.removedInRelease}{"\t"}{.status.requestCount}{"\t"}{.metadata.name}{"\n"}{end}' 164 | 165 | # 过滤出谁在访问被废弃的API 166 | oc get apirequestcounts ingresses.v1beta1.networking.k8s.io \ 167 | -o jsonpath='{range .status.last24h..byUser[*]}{..byVerb[*].verb}{","}{.username}{","}{.userAgent}{"\n"}{end}' \ 168 | | sort -k 2 -t, -u | column -t -s, -NVERBS,USERNAME,USERAGENT 169 | ``` 170 | 171 | 注意,如下用户可忽略,它们是内建用户、会遍历处理所有资源: 172 | - `system:serviceaccount:kube-system:generic-garbage-collector` 和 `system:serviceaccount:kube-system:namespace-controller` ,它们会遍历访问所有资源 173 | - `system:kube-controller-manager` 和 `system:cluster-policy-controller` ,它们会遍历处理所有资源 174 | 175 | 176 | ## 获取用户认证Token 177 | ### 使用whoami获取Token 178 | ```bash 179 | USER=user 180 | PASSWORD=passwd 181 | oc login -u ${USER} -p ${PASSWORD} 182 | oc whoami -t 183 | ``` 184 | 185 | ### 获取OAuth用户的Token 186 | ```bash 187 | USER=user 188 | PASSWORD=passwd 189 | OAUTH=$(oc get route oauth-openshift -n 
openshift-authentication -ojson | jq -r .spec.host) 190 | curl -sik "https://${OAUTH}/oauth/authorize?response_type=token&client_id=openshift-challenging-client" --user ${USER}:${PASSWORD} | grep -oP "access_token=\K[^&]*" 191 | ``` 192 | 193 | ### 从ServiceAccount中获取Token 194 | ```bash 195 | SA=user 196 | NS=test 197 | API=$(oc whoami --show-server) 198 | TOKEN=$(oc serviceaccounts get-token ${SA} -n ${NS}) 199 | # 通过TOKEN查找user 200 | oc get user '~' --token=${TOKEN} 201 | curl -H "Authorization: Bearer ${TOKEN}" -X GET -k ${API}/apis 202 | ``` 203 | 204 | ```bash 205 | ## 修改kube-controller-manager配置 206 | # 编辑kubecontrollermanager.operator.openshift.io/cluster, 207 | # 参考observedConfig,增加unsupportedConfigOverrides字段,在extendedArguments中指定希望配置的参数 208 | spec: 209 | observedConfig: 210 | extendedArguments: 211 | cluster-cidr: 212 | - 10.0.0.0/14 213 | unsupportedConfigOverrides: 214 | extendedArguments: 215 | node-monitor-grace-period: 216 | - 15s 217 | useMoreSecureServiceCA: true 218 | 219 | 220 | ## 查询监控指标 221 | secretname=$(kubectl get serviceaccount --namespace=openshift-monitoring prometheus-k8s -o jsonpath='{.secrets[1].name}') 222 | BRIDGE_K8S_AUTH_BEARER_TOKEN=$(kubectl get secret "$secretname" --namespace=openshift-monitoring -o template --template='{{.data.token}}' | base64 --decode) 223 | THANOS_QUERIER_SVC=$(kubectl get svc -n openshift-monitoring thanos-querier --no-headers | awk '{print $3}') 224 | PROM_QL='ALERTS{alertname!~"Watchdog|AlertmanagerReceiversNotConfigured|PrometheusRemoteWriteDesiredShards",alertstate="firing",severity!="info"}' 225 | 226 | curl -k -H "Authorization: Bearer $BRIDGE_K8S_AUTH_BEARER_TOKEN" \ 227 | "https://$THANOS_QUERIER_SVC:9091/api/v1/query" \ 228 | --data-urlencode "query=$PROM_QL" 229 | 230 | 231 | ## 新增pullSecret 232 | # 编辑 /var/lib/kubelet/config.json,在文件中增加auth 233 | vi /var/lib/kubelet/config.json 234 | # 重启crio服务 235 | systemctl restart crio 236 | 237 | 238 | ## 使用oc命令执行容器镜像mirror操作 239 | oc image mirror -a /var/lib/kubelet/config.json quay.io/openshift-release-dev/ocp-v4.0-art-dev@sha256:ae92a919cb6da4d1a5d832f8bc486ae92e55bf3814ebab94bf4baa4c4bcde85d image.ytinirt.cn/zhaoyao/ocp4 240 | # 如果image.ytinirt.cn没有访问权限,需要把该仓库的auth追加到/var/lib/kubelet/config.json 241 | # 如果image.ytinirt.cn的CA不是权威的,可以将其CA放到 /etc/pki/ca-trust/source/anchors 目录下,并执行 update-ca-trust extract 242 | 243 | 244 | ## 使用podman操作容器镜像 245 | # 镜像导出,导出多个镜像时注意增加 -m 标识 246 | sudo podman save -m quay.io/openshift-release-dev/ocp-v4.0-art-dev@sha256:8c8813c quay.io/openshift-release-dev/ocp-v4.0-art-dev@sha256:a705303fa | gzip > hehe.tar.gz 247 | 248 | 249 | ## TODOTODO: podman inspect vs podman manifest inspect 250 | sudo podman manifest inspect quay.io/openshift-release-dev/ocp-release@sha256:dd71b3cd08ce1e859e0e740a585827c9caa1341819d1121d92879873a127f5e2 251 | sudo podman inspect quay.io/openshift-release-dev/ocp-release@sha256:dd71b3cd08ce1e859e0e740a585827c9caa1341819d1121d92879873a127f5e2 252 | sudo podman manifest inspect quay.io/openshift-release-dev/ocp-v4.0-art-dev@sha256:ae92a919cb6da4d1a5d832f8bc486ae92e55bf3814ebab94bf4baa4c4bcde85d --log-level=debug 253 | 254 | 255 | ## 在OpenShift节点上启调试debug容器 256 | podman run --network=host -it centos bash 257 | 258 | 259 | ## 强制跳过machine-config-operator对节点的mc检查 260 | # 在希望跳过的节点上执行 261 | touch /run/machine-config-daemon-force 262 | 263 | 264 | ## 节点后台直接下载容器镜像 265 | # 配置代理,如果需要 266 | export https_proxy=http://127.0.0.1:8080/ 267 | export http_proxy=http://127.0.0.1:8080/ 268 | # 拿kubelet使用的认证信息,去下载容器镜像 269 | podman pull --authfile 
/v/var/lib/kubelet/config.json quay.io/openshift-release-dev/ocp-v4.0-art-dev@sha256:f5628b30aa047fe32cba9308c70c581f7d9812f40a3e651a84f0532af184bfd2 270 | 271 | 272 | ## 直接操作ETCD数据 273 | # 切换为root用户,并执行如下命令 274 | source /etc/kubernetes/static-pod-resources/etcd-certs/configmaps/etcd-scripts/etcd.env 275 | source /etc/kubernetes/static-pod-resources/etcd-certs/configmaps/etcd-scripts/etcd-common-tools 276 | dl_etcdctl 277 | export ETCDCTL_CERT=/etc/kubernetes/static-pod-resources/etcd-certs/secrets/etcd-all-certs/etcd-peer-master0.crt 278 | export ETCDCTL_KEY=/etc/kubernetes/static-pod-resources/etcd-certs/secrets/etcd-all-certs/etcd-peer-master0.key 279 | export ETCDCTL_CACERT=/etc/kubernetes/static-pod-resources/etcd-certs/configmaps/etcd-serving-ca/ca-bundle.crt 280 | etcdctl ... 281 | 282 | 283 | ## 调用OSUS服务,获取graph的示例: 284 | curl --silent --header 'Accept:application/json' 'https://api.openshift.com/api/upgrades_info/v1/graph?arch=amd64&channel=stable-4.2' 285 | 286 | 287 | ## 对接使用htpasswd IDP 288 | # 创建用户名和密码文件 289 | htpasswd -bB users.htpasswd 290 | # 创建secret 291 | kubectl create secret generic htpass-secret --from-file=htpasswd=users.htpasswd -n openshift-config 292 | # 配置OAuth对接htpasswd IDP 293 | cat < users.htpasswd 315 | # 添加新用户 316 | htpasswd -bB users.htpasswd 317 | # 删除老用户,注意,后续需要同步删除对应的 user 和 identity 资源实例 318 | htpasswd -D users.htpasswd 319 | # 使配置生效 320 | oc create secret generic htpass-secret --from-file=htpasswd=users.htpasswd --dry-run=client -o yaml -n openshift-config | oc replace -f - 321 | 322 | 323 | ## 查看审计日志 324 | oc adm node-logs --role=master --path=kube-apiserver 325 | oc adm node-logs master0 --path=kube-apiserver/audit.log 326 | ## 收集audit审计日志 327 | oc adm must-gather --dest-dir /path/to/audit/logs/dir/ -- /usr/bin/gather_audit_logs 328 | 329 | 330 | ## 查看节点上服务日志 331 | oc adm node-logs -u crio 332 | oc adm node-logs -u kubelet 333 | 334 | 335 | ## 获取集群所有资源对象,这些资源对象由CVO创建管理 336 | # 获取当前版本的update image,实际上其也是cluster-version-operator pod使用的容器镜像 337 | oc get clusterversion -o jsonpath='{.status.desired.image}{"\n"}' version 338 | # 获取CVO管理对象的列表 339 | oc adm release extract --from=quay.io/openshift-release-dev/ocp-release@sha256:1935b6c8277e351550bd7bfcc4d5df7c4ba0f7a90165c022e2ffbe789b15574a --to=release-image 340 | # release-metadata文件携带版本元数据 341 | # image-references文件携带OpenShift集群需要的容器镜像 342 | ls release-image 343 | 344 | ## 直接提取版本镜像release image 345 | $ mkdir /tmp/release 346 | $ oc image extract quay.io/openshift-release-dev/ocp-release:4.5.1-x86_64 --path /:/tmp/release 347 | 348 | 349 | ## 让Operator/资源对象不被CVO管理,此后就能随便edit资源对象了 350 | # 查看当前的override信息 351 | oc get -o json clusterversion version | jq .spec.overrides 352 | # 为了向override中增加表项配置,需要给 clusterversion/version 打 patch 353 | # 新建.spec.overrides 354 | cat <version-patch-first-override.yaml 355 | - op: add 356 | path: /spec/overrides 357 | value: 358 | - kind: Deployment 359 | group: apps 360 | name: network-operator 361 | namespace: openshift-network-operator 362 | unmanaged: true 363 | EOF 364 | # 新增一项override 365 | cat <version-patch-add-override.yaml 366 | - op: add 367 | path: /spec/overrides/- 368 | value: 369 | kind: Deployment 370 | group: apps 371 | name: network-operator 372 | namespace: openshift-network-operator 373 | unmanaged: true 374 | EOF 375 | # 执行patch 376 | oc patch clusterversion version --type json -p "$(cat version-patch.yaml)" 377 | ## 也可以直接停掉CVO 378 | oc scale --replicas 0 -n openshift-cluster-version deployments/cluster-version-operator 379 | 380 | ``` 381 | 382 | ## 更新 383 
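常规更新可先查看当前版本、更新通道和可更新的目标版本,再指定版本触发更新(版本号仅为占位示例):
```bash
# 查看当前版本及可用的更新
oc adm upgrade
# 更新到通道内的指定版本
oc adm upgrade --to=4.12.15
```
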
### Forced upgrade
```bash
oc adm upgrade --to-image=release-image@sha256:xxx --force --allow-explicit-upgrade
```

## Modify the node kubelet configuration
Modify node kubelet configuration through `kubeletConfig`:
```yaml
apiVersion: machineconfiguration.openshift.io/v1
kind: KubeletConfig
metadata:
  name: set-node-lease-duration-worker
spec:
  machineConfigPoolSelector:
    matchLabels:
      pools.operator.machineconfiguration.openshift.io/worker: ""
  kubeletConfig:
    nodeLeaseDurationSeconds: 15
```

## Stop the CVO from reconciling a resource
```bash
# When adding the first override entry
cat <<EOF >patch.yaml
- op: add
  path: /spec/overrides
  value:
  - kind: Deployment
    group: apps
    name: foo
    namespace: bar
    unmanaged: true
EOF

# When the overrides list already exists, append another entry
cat <<EOF >patch.yaml
- op: add
  path: /spec/overrides/-
  value:
    kind: Deployment
    group: apps
    name: foo
    namespace: bar
    unmanaged: true
EOF

# Patch the clusterversion
oc patch clusterversion version --type json -p "$(cat patch.yaml)"
```

## Have the CVO delete a resource
Add the annotation `release.openshift.io/delete: "true"` to the resource's yaml in the manifests.

## Force a component rollout
```bash
kubectl patch kubeapiserver cluster -p='{"spec": {"forceRedeploymentReason": "recovery-'"$( date --rfc-3339=ns )"'"}}' --type=merge
```

# Testing
## Searching what CI jobs execute
[Link](https://steps.ci.openshift.org/search)

## UT

## IT

## Component e2e

## Release e2e


# OpenShift3 and OKD

## Common operations

Permission operations (a verification sketch follows the operations block below):

```bash
oc adm policy add-scc-to-user privileged -z default -n <namespace>
oc adm policy add-scc-to-user anyuid -z istio-pilot-service-account -n istio-system
oc adm policy add-scc-to-user anyuid -z istio-sidecar-injector-service-account -n istio-system
oc adm policy add-cluster-role-to-user cluster-reader system:serviceaccount:<namespace>:default
oc adm policy add-cluster-role-to-user cluster-reader -z default -n <namespace>
```

Customize the router service ports:

```bash
oc adm policy add-scc-to-user hostnetwork -z router
oc adm router router --ports='10080:10080,10443:10443' --replicas=0 --service-account=router

oc edit dc/router # modify the environment variables ROUTER_SERVICE_HTTPS_PORT and ROUTER_SERVICE_HTTP_PORT
# or run
oc set env dc/router ROUTER_SERVICE_HTTP_PORT=10080 ROUTER_SERVICE_HTTPS_PORT=10443

oc scale dc/router --replicas=3

# the following may also be required
iptables -A INPUT -p tcp --dport 10080 -j ACCEPT
iptables -A INPUT -p tcp --dport 10443 -j ACCEPT
```

Operations and maintenance:

```bash
# Log in
oc login https://vip.cluster.local:8443 -u system:admin
# Or skip server certificate verification
oc login https://vip.cluster.local:8443 -u system:admin --insecure-skip-tls-verify

# View logs
master-logs api api # apiserver logs
master-logs controllers controllers # controller service logs

# Restart services
master-restart api # restart the api service
master-restart controllers # restart the controller service

# Check whether the CNI server is healthy
echo 'test' | socat - UNIX-CONNECT:/var/run/openshift-sdn/cni-server.sock
```

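After granting SCCs or cluster roles as in the permission operations above, it helps to confirm what actually took effect. A minimal verification sketch (pod and namespace names are placeholders):

```bash
# The SCC that admitted a running pod is recorded in the openshift.io/scc annotation
oc get pod <pod-name> -n <namespace> -o yaml | grep 'openshift.io/scc'

# Users, groups and service accounts granted an SCC are listed on the SCC itself
oc describe scc anyuid

# Bindings that carry the cluster-reader role
oc get clusterrolebinding | grep cluster-reader
```
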
Access the web console:

1. Copy the entries in `/etc/hosts` on any cluster node into `C:\Windows\System32\drivers\etc\hosts` on your PC
2. Open `https://vip.cluster.local:8443/`
3. Username and password: `system/admin` or `admin/system`



Add up the platform components' memory (RSS) usage:

```bash
ps -eo 'pid,rss,comm' | grep -i 'openshift\|hyperkube\|ovs\|origin\|etcd\|dockerd' | awk '{a+=$2}END{print a}'
```



## Official yum repository

Address: `http://mirrors.xxx.com/centos/7/paas/x86_64/openshift-origin/`

```bash
cat <<EOF >/etc/yum.repos.d/openshift-origin.repo
[openshift-origin]
name=OpenShift Origin packages for CentOS 7 - $basearch
baseurl=http://mirrors.xxx.com/centos/7/paas/x86_64/openshift-origin/
enabled=1
gpgcheck=0
EOF
```


## OpenShift 3.x DNS overview
The code in `origin/pkg/dns/serviceresolver.go` implements the skydns backend interface, which resolves domain names (svc) to IPs (clusterIP).

The dnsmasq service running on each host is configured in `/etc/dnsmasq.d/origin-dns.conf`:
1. The controller (master) nodes run master-api, which listens on `0.0.0.0:8053`; its data comes from the apiserver.
2. The nodes run skydns (similar to the master, skydns is built in), listening on `127.0.0.1:53`; its data also comes from the apiserver, see `pkg/cmd/server/start/start_allinone.go:250`.
3. The node hosts run dnsmasq, listening on port 53 of every interface except lo; its upstream is the skydns from item 2.

Capture DNS queries on the host:
```bash
tcpdump -i lo port 53 -nnl
```
Although the nameserver in `/etc/resolv.conf` is set to the cluster NIC's IP address, tcpdump on that cluster NIC does not capture the DNS packets.



## Digging into the OpenShift SDN network
Reference: [理解OpenShift(3):网络之 SDN](https://www.cnblogs.com/sammyliu/p/10064450.html)

Notes on the steps of the flow diagram in the reference (a hands-on sketch for inspecting the result follows this list):
1. CRI, docker_sandbox, dockershim; the executing entity is origin-node
2. docker creates the container directly
3. The CNI plugin manager invokes the openshift-sdn plugin; the executing entity is origin-node, and the openshift-sdn binary lives in /opt/cni/bin
4. The request is sent to the cni-server; the executing entity is the openshift-sdn pod
5. The host-local IPAM plugin is called (see pkg/network/node/pod.go:497) to obtain the IP address and route information, which is returned directly to the openshift-sdn plugin; then go to step 8
6. See pkg/network/node/pod.go:497, which calls m.ovs.SetUpPod(req.SandboxID, req.HostVeth, podIP, vnid)
7. See pkg/network/node/ovscontroller.go:267
8. The openshift-sdn plugin calls ip.SetHWAddrByIP and ipam.ConfigureIface to set the IP address and routes
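To see what steps 5-8 actually produce on a node, inspect the pod's network namespace and the br0 bridge directly. A minimal sketch (the container ID is a placeholder, and docker is assumed as the runtime, matching the flow above):

```bash
# PID of the pod's infra (pause) container
PID=$(docker inspect -f '{{.State.Pid}}' <infra-container-id>)

# IP address and default routes written by ipam.ConfigureIface (steps 5 and 8)
nsenter -t $PID -n ip addr show eth0
nsenter -t $PID -n ip route

# Host-side veth ports that SetUpPod attached to br0 (steps 6 and 7)
ovs-vsctl list-ports br0
```
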

Each node's subnet information (similar to what flanneld stores in etcd under /coreos.com/network) lives at:
```bash
[root@op-m ~]# etcdctl3 get /openshift.io/registry --prefix --keys-only
/openshift.io/registry/sdnnetworks/default
/openshift.io/registry/sdnsubnets/op-m
/openshift.io/registry/sdnsubnets/op-s1
/openshift.io/registry/sdnsubnets/op-s2

[root@op-m ~]# etcdctl3 get /openshift.io/registry/sdnnetworks/default | strings
/openshift.io/registry/sdnnetworks/default
network.openshift.io/v1
ClusterNetwork
default
*$bc235484-08f0-11e9-9f1d-0cda411d819b2
10.101.0.0/16
10.100.0.0/16*
redhat/openshift-ovs-subnet2
10.101.0.0/16
[root@op-m ~]# etcdctl3 get /openshift.io/registry/sdnsubnets/op-m | strings
/openshift.io/registry/sdnsubnets/op-m
network.openshift.io/v1
HostSubnet
op-m
*$bca6bebb-08f0-11e9-9f1d-0cda411d819b2
!pod.network.openshift.io/node-uid
$b787a6f2-08f0-11e9-9f1d-0cda411d819bz
op-m
172.25.18.233"
10.101.2.0/23
```
OpenShift SDN configures each node's subnet from the information above.
The OpenShift SDN cni-server's working directory is /run/openshift-sdn.

On the nodes the kubelet service is configured with `/usr/bin/hyperkube kubelet --network-plugin=cni`:
```bash
[root@slim-m-18-233 ~]# cat /etc/cni/net.d/80-openshift-network.conf
{
  "cniVersion": "0.2.0",
  "name": "openshift-sdn",
  "type": "openshift-sdn"
}
[root@slim-m-18-233 bin]# pwd
/opt/cni/bin
[root@slim-m-18-233 bin]# ls
host-local loopback openshift-sdn
```

The openshift-sdn plugin:
1. Obtains an IP address via IPAM and generates the default routes from the subnet address
2. Sets up OVS (ovs-vsctl adds the host-side veth of the infra container to br0, ovs-ofctl installs the flow rules)

The node-local network state is kept under `/var/lib/cni/networks/openshift-sdn`, for example:
```bash
[root@xu openshift-sdn]# cat 10.101.2.92
1cc6a193e9ea4320e0f6282d4eaa6701e12fa21ff361d720c03f6e1fe9d1b324
```

An example of allocating an IP address with the host-local IPAM plugin:
```bash
echo '{ "cniVersion": "0.3.1", "name": "examplenet", "ipam": { "type": "host-local", "ranges": [ [{"subnet": "203.0.113.0/24"}], [{"subnet": "2001:db8:1::/64"}]], "dataDir": "/tmp/cni-example" } }' | CNI_COMMAND=ADD CNI_CONTAINERID=example CNI_NETNS=/dev/null CNI_IFNAME=dummy0 CNI_PATH=.
./host-local 627 | ``` 628 | 629 | 进入openshift-sdn命名空间任一pod,使用如下命令查看信息: 630 | ```bash 631 | ovs-vsctl show 632 | ovs-ofctl -O OpenFlow13 dump-flows br0 633 | ovs-ofctl -O OpenFlow13 dump-tables br0 634 | ovs-ofctl -O OpenFlow13 dump-ports br0 635 | ovs-ofctl -O OpenFlow13 show br0 636 | nsenter -t <容器的PID> -n ip link 637 | iptables -t nat -s 638 | ``` 639 | 640 | 为Pod设置默认路由的地方: 641 | ```golang 642 | // pkg/network/node/pod.go:112 643 | 644 | // Generates a CNI IPAM config from a given node cluster and local subnet that 645 | // CNI 'host-local' IPAM plugin will use to create an IP address lease for the 646 | // container 647 | func getIPAMConfig(clusterNetworks []common.ClusterNetwork, localSubnet string) ([]byte, error) 648 | 649 | ``` 650 | 651 | 652 | -------------------------------------------------------------------------------- /docs/performance.md: -------------------------------------------------------------------------------- 1 | 2 | * [Linux Performance](#linux-performance) 3 | * [BPF](#bpf) 4 | 5 | 6 | # Linux Performance 7 | https://www.brendangregg.com/linuxperf.html 8 | 9 | # BPF 10 | https://ebpf.io/ -------------------------------------------------------------------------------- /images/fdisk-extend-partition.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ytinirt/notes/66cba70e55b46acc093cbd2ee649ada4dcaec2dc/images/fdisk-extend-partition.png -------------------------------------------------------------------------------- /images/xfs_growfs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ytinirt/notes/66cba70e55b46acc093cbd2ee649ada4dcaec2dc/images/xfs_growfs.png -------------------------------------------------------------------------------- /tool/gh-md-toc: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # 4 | # Steps: 5 | # 6 | # 1. Download corresponding html file for some README.md: 7 | # curl -s $1 8 | # 9 | # 2. Discard rows where no substring 'user-content-' (github's markup): 10 | # awk '/user-content-/ { ... 11 | # 12 | # 3.1 Get last number in each row like ' ... sitemap.js.*<\/h/)+2, RLENGTH-5) 21 | # 22 | # 5. Find anchor and insert it inside "(...)": 23 | # substr($0, match($0, "href=\"[^\"]+?\" ")+6, RLENGTH-8) 24 | # 25 | 26 | gh_toc_version="0.6.1" 27 | 28 | gh_user_agent="gh-md-toc v$gh_toc_version" 29 | 30 | # 31 | # Download rendered into html README.md by its url. 32 | # 33 | # 34 | gh_toc_load() { 35 | local gh_url=$1 36 | 37 | if type curl &>/dev/null; then 38 | curl --user-agent "$gh_user_agent" -s "$gh_url" 39 | elif type wget &>/dev/null; then 40 | wget --user-agent="$gh_user_agent" -qO- "$gh_url" 41 | else 42 | echo "Please, install 'curl' or 'wget' and try again." 43 | exit 1 44 | fi 45 | } 46 | 47 | # 48 | # Converts local md file into html by GitHub 49 | # 50 | # ➥ curl -X POST --data '{"text": "Hello world github/linguist#1 **cool**, and #1!"}' https://api.github.com/markdown 51 | #

<p>Hello world github/linguist#1 <strong>cool</strong>, and #1!</p>

'" 52 | gh_toc_md2html() { 53 | local gh_file_md=$1 54 | URL=https://api.github.com/markdown/raw 55 | if [ ! -z "$GH_TOC_TOKEN" ]; then 56 | TOKEN=$GH_TOC_TOKEN 57 | else 58 | TOKEN="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/token.txt" 59 | fi 60 | if [ -f "$TOKEN" ]; then 61 | AUTH_HEADER=("-H" "Authorization: token $(cat $TOKEN)") 62 | fi 63 | # echo $URL 1>&2 64 | OUTPUT="$(curl -s --user-agent "$gh_user_agent" \ 65 | --data-binary @"$gh_file_md" -H "Content-Type:text/plain" "${AUTH_HEADER[@]}" \ 66 | $URL)" 67 | 68 | if [ "$?" != "0" ]; then 69 | echo "XXNetworkErrorXX" 70 | fi 71 | if [ "$(echo "${OUTPUT}" | awk '/API rate limit exceeded/')" != "" ]; then 72 | echo "XXRateLimitXX" 73 | else 74 | echo "${OUTPUT}" 75 | fi 76 | } 77 | 78 | 79 | # 80 | # Is passed string url 81 | # 82 | gh_is_url() { 83 | case $1 in 84 | https* | http*) 85 | echo "yes";; 86 | *) 87 | echo "no";; 88 | esac 89 | } 90 | 91 | # 92 | # TOC generator 93 | # 94 | gh_toc(){ 95 | local gh_src=$1 96 | local gh_src_copy=$1 97 | local gh_ttl_docs=$2 98 | local need_replace=$3 99 | 100 | if [ "$gh_src" = "" ]; then 101 | echo "Please, enter URL or local path for a README.md" 102 | exit 1 103 | fi 104 | 105 | 106 | # Show "TOC" string only if working with one document 107 | if [ "$gh_ttl_docs" = "1" ]; then 108 | 109 | echo "Table of Contents" 110 | echo "=================" 111 | echo "" 112 | gh_src_copy="" 113 | 114 | fi 115 | 116 | if [ "$(gh_is_url "$gh_src")" == "yes" ]; then 117 | gh_toc_load "$gh_src" | gh_toc_grab "$gh_src_copy" 118 | if [ "${PIPESTATUS[0]}" != "0" ]; then 119 | echo "Could not load remote document." 120 | echo "Please check your url or network connectivity" 121 | exit 1 122 | fi 123 | if [ "$need_replace" = "yes" ]; then 124 | echo 125 | echo "!! '$gh_src' is not a local file" 126 | echo "!! Can't insert the TOC into it." 127 | echo 128 | fi 129 | else 130 | local rawhtml=$(gh_toc_md2html "$gh_src") 131 | if [ "$rawhtml" == "XXNetworkErrorXX" ]; then 132 | echo "Parsing local markdown file requires access to github API" 133 | echo "Please make sure curl is installed and check your network connectivity" 134 | exit 1 135 | fi 136 | if [ "$rawhtml" == "XXRateLimitXX" ]; then 137 | echo "Parsing local markdown file requires access to github API" 138 | echo "Error: You exceeded the hourly limit. See: https://developer.github.com/v3/#rate-limiting" 139 | TOKEN="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/token.txt" 140 | echo "or place github auth token here: $TOKEN" 141 | exit 1 142 | fi 143 | local toc=`echo "$rawhtml" | gh_toc_grab "$gh_src_copy"` 144 | echo "$toc" 145 | if [ "$need_replace" = "yes" ]; then 146 | if grep -Fxq "" $gh_src && grep -Fxq "" $gh_src; then 147 | echo "Found markers" 148 | else 149 | echo "You don't have or in your file...exiting" 150 | exit 1 151 | fi 152 | local ts="<\!--ts-->" 153 | local te="<\!--te-->" 154 | local dt=`date +'%F_%H%M%S'` 155 | local ext=".orig.${dt}" 156 | local toc_path="${gh_src}.toc.${dt}" 157 | local toc_footer="" 158 | # http://fahdshariff.blogspot.ru/2012/12/sed-mutli-line-replacement-between-two.html 159 | # clear old TOC 160 | sed -i${ext} "/${ts}/,/${te}/{//!d;}" "$gh_src" 161 | # create toc file 162 | echo "${toc}" > "${toc_path}" 163 | echo -e "\n${toc_footer}\n" >> "$toc_path" 164 | # insert toc file 165 | if [[ "`uname`" == "Darwin" ]]; then 166 | sed -i "" "/${ts}/r ${toc_path}" "$gh_src" 167 | else 168 | sed -i "/${ts}/r ${toc_path}" "$gh_src" 169 | fi 170 | echo 171 | echo "!! TOC was added into: '$gh_src'" 172 | echo "!! 
Origin version of the file: '${gh_src}${ext}'" 173 | rm -f ${gh_src}${ext} 174 | echo "!! TOC added into a separate file: '${toc_path}'" 175 | rm -f ${toc_path} 176 | echo 177 | fi 178 | fi 179 | } 180 | 181 | # 182 | # Grabber of the TOC from rendered html 183 | # 184 | # $1 — a source url of document. 185 | # It's need if TOC is generated for multiple documents. 186 | # 187 | gh_toc_grab() { 188 | # if closed is on the new line, then move it on the prev line 189 | # for example: 190 | # was: The command foo1 191 | # 192 | # became: The command foo1 193 | sed -e ':a' -e 'N' -e '$!ba' -e 's/\n<\/h/<\/h/g' | 194 | # find strings that corresponds to template 195 | grep -E -o '//g' | sed 's/<\/code>//g' | 198 | # now all rows are like: 199 | # ... .*<\/h/)+2, RLENGTH-5) 206 | href = substr($0, match($0, "href=\"[^\"]+?\"")+6, RLENGTH-7) 207 | print sprintf("%*s", level*3, " ") "* [" text "](" gh_url href ")" }' | 208 | sed 'y/+/ /; s/%/\\x/g')" 209 | } 210 | 211 | # 212 | # Returns filename only from full path or url 213 | # 214 | gh_toc_get_filename() { 215 | echo "${1##*/}" 216 | } 217 | 218 | # 219 | # Options hendlers 220 | # 221 | gh_toc_app() { 222 | local need_replace="no" 223 | 224 | if [ "$1" = '--help' ] || [ $# -eq 0 ] ; then 225 | local app_name=$(basename "$0") 226 | echo "GitHub TOC generator ($app_name): $gh_toc_version" 227 | echo "" 228 | echo "Usage:" 229 | echo " $app_name [--insert] src [src] Create TOC for a README file (url or local path)" 230 | echo " $app_name - Create TOC for markdown from STDIN" 231 | echo " $app_name --help Show help" 232 | echo " $app_name --version Show version" 233 | return 234 | fi 235 | 236 | if [ "$1" = '--version' ]; then 237 | echo "$gh_toc_version" 238 | echo 239 | echo "os: `lsb_release -d | cut -f 2`" 240 | echo "kernel: `cat /proc/version`" 241 | echo "shell: `$SHELL --version`" 242 | echo 243 | for tool in curl wget grep awk sed; do 244 | printf "%-5s: " $tool 245 | echo `$tool --version | head -n 1` 246 | done 247 | return 248 | fi 249 | 250 | if [ "$1" = "-" ]; then 251 | if [ -z "$TMPDIR" ]; then 252 | TMPDIR="/tmp" 253 | elif [ -n "$TMPDIR" -a ! -d "$TMPDIR" ]; then 254 | mkdir -p "$TMPDIR" 255 | fi 256 | local gh_tmp_md 257 | gh_tmp_md=$(mktemp $TMPDIR/tmp.XXXXXX) 258 | while read input; do 259 | echo "$input" >> "$gh_tmp_md" 260 | done 261 | gh_toc_md2html "$gh_tmp_md" | gh_toc_grab "" 262 | return 263 | fi 264 | 265 | if [ "$1" = '--insert' ]; then 266 | need_replace="yes" 267 | shift 268 | fi 269 | 270 | for md in "$@" 271 | do 272 | echo "" 273 | gh_toc "$md" "$#" "$need_replace" 274 | done 275 | 276 | echo "" 277 | echo "Created by [gh-md-toc](https://github.com/ekalinin/github-markdown-toc)" 278 | } 279 | 280 | # 281 | # Entry point 282 | # 283 | gh_toc_app "$@" 284 | -------------------------------------------------------------------------------- /tool/update-toc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | TOOL_DIR=$(dirname $0) 4 | DOCS_DIR="${TOOL_DIR}/../docs" 5 | 6 | for doc in $(find ${DOCS_DIR} -name "*.md"); do 7 | ${TOOL_DIR}/gh-md-toc --insert ${doc} 8 | done 9 | --------------------------------------------------------------------------------
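A quick way to regenerate the tables of contents with the two tools above; the token value is a placeholder (gh-md-toc reads it from GH_TOC_TOKEN, or from tool/token.txt, when rendering local markdown through the GitHub API):

```bash
# Avoid the GitHub API rate limit when processing local files
export GH_TOC_TOKEN=<your-github-token>
# update-toc.sh resolves paths relative to its own location, so it can be run from anywhere;
# it walks docs/*.md and runs gh-md-toc --insert on each file
bash tool/update-toc.sh
```
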