├── .nvmrc
├── Dockerfile
├── .DS_Store
├── README.md
├── .gitignore
├── content
└── zh
│ ├── search.md
│ ├── featured-background.jpg
│ ├── featured-background.jpeg
│ ├── docs
│ ├── tricks
│ │ ├── _index.md
│ │ ├── direct-response
│ │ │ └── _index.md
│ │ ├── perodical-access-log
│ │ │ └── _index.md
│ │ └── udp-listener
│ │ │ └── _index.md
│ ├── best-practice
│ │ ├── image
│ │ │ ├── eshop-demo-1.jpg
│ │ │ ├── eshop-demo.jpg
│ │ │ ├── screenshot1.png
│ │ │ ├── trace-screenshot-1.png
│ │ │ ├── trace-screenshot-2.png
│ │ │ ├── trace-screenshot-3.png
│ │ │ ├── trace-screenshot-4.png
│ │ │ ├── trace-screenshot-5.png
│ │ │ ├── trace-screenshot-6.png
│ │ │ ├── monolith-microserivce.jpg
│ │ │ └── tracing_mental_model.png
│ │ ├── _index.md
│ │ ├── vs-priority
│ │ │ └── _index.md
│ │ ├── graceful-termination
│ │ │ └── _index.md
│ │ ├── startup-dependence
│ │ │ └── _index.md
│ │ ├── http-header-case
│ │ │ └── _index.md
│ │ ├── internal-redirect
│ │ │ └── _index.md
│ │ ├── async-message-tracing
│ │ │ └── _index.md
│ │ └── method-level-trcing
│ │ │ └── _index.md
│ ├── common-problem
│ │ ├── image
│ │ │ ├── externalname.png
│ │ │ ├── pprof-heap-1.png
│ │ │ ├── pprof-heap-2.png
│ │ │ ├── envoy-initialize.png
│ │ │ ├── tcp-keepalive-ss-1.png
│ │ │ ├── tcp-keepalive-ss-2.png
│ │ │ ├── tcp-keepalive-ss-3.png
│ │ │ ├── tcp-keepalive-ss-4.png
│ │ │ ├── tcp-keepalive-ss-5.png
│ │ │ ├── tcp-keepalive-package.png
│ │ │ ├── request_path_dimension.png
│ │ │ └── pilot_total_rejected_configs.png
│ │ ├── unbalanced-workers
│ │ │ ├── cpu.png
│ │ │ ├── cpu-balanced
│ │ │ └── _index.md
│ │ ├── upstream-connection-termination
│ │ │ ├── debug.png
│ │ │ └── _index.md
│ │ ├── _index.md
│ │ ├── envoy-stats-memory
│ │ │ └── _index.md
│ │ ├── application-start-fail
│ │ │ └── _index.md
│ │ ├── gateway-tcp-port-not-found
│ │ │ └── _index.md
│ │ ├── external-name-service-highjacks
│ │ │ └── _index.md
│ │ ├── tls-wrong-sni
│ │ │ └── _index.md
│ │ ├── duplicate-tls-hosts
│ │ │ └── _index.md
│ │ ├── server-speaks-first-protocol
│ │ │ └── _index.md
│ │ └── tcp-keepalive
│ │ │ └── _index.md
│ ├── debug-istio
│ │ ├── envoy-log
│ │ │ ├── image
│ │ │ │ ├── envoy-model.png
│ │ │ │ ├── request-route.png
│ │ │ │ └── downstream-upstream.png
│ │ │ └── _index.md
│ │ ├── istio-debug
│ │ │ ├── image
│ │ │ │ ├── istio-debug.png
│ │ │ │ └── istio-heap.png
│ │ │ └── _index.md
│ │ ├── _index.md
│ │ └── envoy-profiler
│ │ │ └── _index.md
│ └── _index.md
│ └── _index.html
├── netlify.toml
├── .gitmodules
├── assets
└── scss
│ └── _variables_project.scss
├── docker-compose.yaml
├── go.mod
├── layouts
├── 404.html
└── partials
│ └── page-meta-lastmod.html
├── package.json
├── CONTRIBUTING.md
├── go.sum
├── config.toml
└── LICENSE
/.nvmrc:
--------------------------------------------------------------------------------
1 | lts/*
2 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM klakegg/hugo:ext-alpine
2 |
3 | RUN apk add git
4 |
--------------------------------------------------------------------------------
/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhaohuabing/istio-guide/HEAD/.DS_Store
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Istio Operations in Practice
2 |
3 | [Read online](https://istio-operation-bible.aeraki.net/)
4 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | /public
2 | resources/
3 | node_modules/
4 | package-lock.json
5 | .hugo_build.lock
--------------------------------------------------------------------------------
/content/zh/search.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: Search Results
3 | layout: search
4 |
5 | ---
6 |
7 |
--------------------------------------------------------------------------------
/netlify.toml:
--------------------------------------------------------------------------------
1 | [build]
2 | [build.environment]
3 | HUGO_VERSION = "0.96.0"
4 | GO_VERSION = "1.18"
5 |
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 |
2 | [submodule "themes/docsy"]
3 | path = themes/docsy
4 | url = https://github.com/google/docsy
5 |
--------------------------------------------------------------------------------
/assets/scss/_variables_project.scss:
--------------------------------------------------------------------------------
1 | /*
2 |
3 | Add styles or override variables from the theme here.
4 |
5 | */
6 |
7 |
--------------------------------------------------------------------------------
/content/zh/featured-background.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhaohuabing/istio-guide/HEAD/content/zh/featured-background.jpg
--------------------------------------------------------------------------------
/content/zh/featured-background.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhaohuabing/istio-guide/HEAD/content/zh/featured-background.jpeg
--------------------------------------------------------------------------------
/content/zh/docs/tricks/_index.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Istio Tips"
3 | linkTitle: "Istio Tips"
4 | weight: 90
5 | description: >
6 |   A collection of small Istio tips
7 | ---
8 |
--------------------------------------------------------------------------------
/content/zh/docs/best-practice/image/eshop-demo-1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhaohuabing/istio-guide/HEAD/content/zh/docs/best-practice/image/eshop-demo-1.jpg
--------------------------------------------------------------------------------
/content/zh/docs/best-practice/image/eshop-demo.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhaohuabing/istio-guide/HEAD/content/zh/docs/best-practice/image/eshop-demo.jpg
--------------------------------------------------------------------------------
/content/zh/docs/best-practice/image/screenshot1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhaohuabing/istio-guide/HEAD/content/zh/docs/best-practice/image/screenshot1.png
--------------------------------------------------------------------------------
/content/zh/docs/common-problem/image/externalname.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhaohuabing/istio-guide/HEAD/content/zh/docs/common-problem/image/externalname.png
--------------------------------------------------------------------------------
/content/zh/docs/common-problem/image/pprof-heap-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhaohuabing/istio-guide/HEAD/content/zh/docs/common-problem/image/pprof-heap-1.png
--------------------------------------------------------------------------------
/content/zh/docs/common-problem/image/pprof-heap-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhaohuabing/istio-guide/HEAD/content/zh/docs/common-problem/image/pprof-heap-2.png
--------------------------------------------------------------------------------
/content/zh/docs/common-problem/image/envoy-initialize.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhaohuabing/istio-guide/HEAD/content/zh/docs/common-problem/image/envoy-initialize.png
--------------------------------------------------------------------------------
/content/zh/docs/common-problem/unbalanced-workers/cpu.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhaohuabing/istio-guide/HEAD/content/zh/docs/common-problem/unbalanced-workers/cpu.png
--------------------------------------------------------------------------------
/content/zh/docs/best-practice/image/trace-screenshot-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhaohuabing/istio-guide/HEAD/content/zh/docs/best-practice/image/trace-screenshot-1.png
--------------------------------------------------------------------------------
/content/zh/docs/best-practice/image/trace-screenshot-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhaohuabing/istio-guide/HEAD/content/zh/docs/best-practice/image/trace-screenshot-2.png
--------------------------------------------------------------------------------
/content/zh/docs/best-practice/image/trace-screenshot-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhaohuabing/istio-guide/HEAD/content/zh/docs/best-practice/image/trace-screenshot-3.png
--------------------------------------------------------------------------------
/content/zh/docs/best-practice/image/trace-screenshot-4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhaohuabing/istio-guide/HEAD/content/zh/docs/best-practice/image/trace-screenshot-4.png
--------------------------------------------------------------------------------
/content/zh/docs/best-practice/image/trace-screenshot-5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhaohuabing/istio-guide/HEAD/content/zh/docs/best-practice/image/trace-screenshot-5.png
--------------------------------------------------------------------------------
/content/zh/docs/best-practice/image/trace-screenshot-6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhaohuabing/istio-guide/HEAD/content/zh/docs/best-practice/image/trace-screenshot-6.png
--------------------------------------------------------------------------------
/content/zh/docs/common-problem/image/tcp-keepalive-ss-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhaohuabing/istio-guide/HEAD/content/zh/docs/common-problem/image/tcp-keepalive-ss-1.png
--------------------------------------------------------------------------------
/content/zh/docs/common-problem/image/tcp-keepalive-ss-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhaohuabing/istio-guide/HEAD/content/zh/docs/common-problem/image/tcp-keepalive-ss-2.png
--------------------------------------------------------------------------------
/content/zh/docs/common-problem/image/tcp-keepalive-ss-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhaohuabing/istio-guide/HEAD/content/zh/docs/common-problem/image/tcp-keepalive-ss-3.png
--------------------------------------------------------------------------------
/content/zh/docs/common-problem/image/tcp-keepalive-ss-4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhaohuabing/istio-guide/HEAD/content/zh/docs/common-problem/image/tcp-keepalive-ss-4.png
--------------------------------------------------------------------------------
/content/zh/docs/common-problem/image/tcp-keepalive-ss-5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhaohuabing/istio-guide/HEAD/content/zh/docs/common-problem/image/tcp-keepalive-ss-5.png
--------------------------------------------------------------------------------
/content/zh/docs/debug-istio/envoy-log/image/envoy-model.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhaohuabing/istio-guide/HEAD/content/zh/docs/debug-istio/envoy-log/image/envoy-model.png
--------------------------------------------------------------------------------
/content/zh/docs/best-practice/image/monolith-microserivce.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhaohuabing/istio-guide/HEAD/content/zh/docs/best-practice/image/monolith-microserivce.jpg
--------------------------------------------------------------------------------
/content/zh/docs/best-practice/image/tracing_mental_model.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhaohuabing/istio-guide/HEAD/content/zh/docs/best-practice/image/tracing_mental_model.png
--------------------------------------------------------------------------------
/content/zh/docs/common-problem/image/tcp-keepalive-package.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhaohuabing/istio-guide/HEAD/content/zh/docs/common-problem/image/tcp-keepalive-package.png
--------------------------------------------------------------------------------
/content/zh/docs/common-problem/unbalanced-workers/cpu-balanced:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhaohuabing/istio-guide/HEAD/content/zh/docs/common-problem/unbalanced-workers/cpu-balanced
--------------------------------------------------------------------------------
/content/zh/docs/debug-istio/envoy-log/image/request-route.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhaohuabing/istio-guide/HEAD/content/zh/docs/debug-istio/envoy-log/image/request-route.png
--------------------------------------------------------------------------------
/content/zh/docs/debug-istio/istio-debug/image/istio-debug.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhaohuabing/istio-guide/HEAD/content/zh/docs/debug-istio/istio-debug/image/istio-debug.png
--------------------------------------------------------------------------------
/content/zh/docs/debug-istio/istio-debug/image/istio-heap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhaohuabing/istio-guide/HEAD/content/zh/docs/debug-istio/istio-debug/image/istio-heap.png
--------------------------------------------------------------------------------
/content/zh/docs/common-problem/image/request_path_dimension.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhaohuabing/istio-guide/HEAD/content/zh/docs/common-problem/image/request_path_dimension.png
--------------------------------------------------------------------------------
/content/zh/docs/debug-istio/envoy-log/image/downstream-upstream.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhaohuabing/istio-guide/HEAD/content/zh/docs/debug-istio/envoy-log/image/downstream-upstream.png
--------------------------------------------------------------------------------
/content/zh/docs/best-practice/_index.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Istio Best Practices"
3 | linkTitle: "Istio Best Practices"
4 | weight: 3
5 | description: >
6 |   Best practices for migrating from traditional microservice frameworks such as Spring Cloud and Dubbo to the Istio service mesh
7 | ---
--------------------------------------------------------------------------------
/content/zh/docs/common-problem/image/pilot_total_rejected_configs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhaohuabing/istio-guide/HEAD/content/zh/docs/common-problem/image/pilot_total_rejected_configs.png
--------------------------------------------------------------------------------
/content/zh/docs/common-problem/upstream-connection-termination/debug.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhaohuabing/istio-guide/HEAD/content/zh/docs/common-problem/upstream-connection-termination/debug.png
--------------------------------------------------------------------------------
/content/zh/docs/best-practice/vs-priority/_index.md:
--------------------------------------------------------------------------------
1 |
2 | ---
3 | title: "Set the Order of Routing Rules in a VirtualService Correctly"
4 | linkTitle: "Set the Order of Routing Rules in a VirtualService Correctly"
5 | weight: 4
6 | date: 2022-11-07
7 | description: Set the order of routing rules in a VirtualService correctly
8 | ---
9 |
10 | ## TODO
11 |
--------------------------------------------------------------------------------
/docker-compose.yaml:
--------------------------------------------------------------------------------
1 | version: "3.3"
2 |
3 | services:
4 |
5 | site:
6 | image: docsy/docsy-example
7 | build:
8 | context: .
9 | command: server
10 | ports:
11 | - "1313:1313"
12 | volumes:
13 | - .:/src
14 |
--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
1 | module github.com/google/docsy-example
2 |
3 | go 1.12
4 |
5 | require (
6 | github.com/FortAwesome/Font-Awesome v0.0.0-20210804190922-7d3d774145ac // indirect
7 | github.com/google/docsy v0.2.0 // indirect
8 | github.com/twbs/bootstrap v4.6.1+incompatible // indirect
9 | )
10 |
--------------------------------------------------------------------------------
/layouts/404.html:
--------------------------------------------------------------------------------
1 | {{ define "main"}}
2 | <h1>Not found</h1>
3 | <p>Oops! This page doesn't exist. Try going back to our home page.</p>
4 | <p>You can learn how to make a 404 page like this in Custom 404 Pages.</p>
5 | {{ end }}
--------------------------------------------------------------------------------
/content/zh/docs/common-problem/_index.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Common Istio Problems"
3 | linkTitle: "Common Istio Problems"
4 | weight: 2
5 | description: >
6 |   How to resolve common problems encountered when using Istio
7 | ---
8 |
--------------------------------------------------------------------------------
/content/zh/docs/_index.md:
--------------------------------------------------------------------------------
1 |
2 | ---
3 | title: "Istio Operations in Practice"
4 | linkTitle: "Istio Operations in Practice"
5 | weight: 1
6 | date: 2022-07-05
7 | menu:
8 | main:
9 | weight: 20
10 | pre:
11 | ---
12 | ## Preface
13 | By moving traffic management, observability, and communication security (capabilities that traditional microservice frameworks implement inside an SDK) down into a dedicated "service mesh" infrastructure layer, Istio decouples these service-governance concerns from business logic in code, build, and deployment. This lets microservices finally deliver on their promises of "choose the best language for each service" and "deploy and upgrade independently", improving development and deployment agility and unlocking the productivity of the microservice model.
14 |
15 | However, introducing the service mesh as an infrastructure layer also brings new challenges to the operations stack. For operators, Istio and Envoy come with a fairly steep learning curve. The Tencent Cloud Mesh (TCM) team was among the earliest adopters of service mesh technology and has accumulated extensive experience troubleshooting and operating Istio/Envoy. This e-book records the operational lessons the TCM team has distilled from a large number of real-world cases, along with best practices for using Istio. We hope you find it useful.
16 |
17 |
18 |
19 |
--------------------------------------------------------------------------------
/content/zh/docs/debug-istio/_index.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Istio Debugging Guide"
3 | linkTitle: "Istio Debugging Guide"
4 | weight: 1
5 | description: >
6 |   How to quickly analyze and resolve problems in the service mesh itself
7 | ---
8 |
9 | A service mesh provides microservices with an infrastructure layer for service-to-service communication. It uniformly offers basic capabilities such as service discovery, load balancing, retries, and circuit breaking, as well as advanced traffic-management features such as service routing, canary releases, and chaos testing.
10 |
11 | Introducing a service mesh greatly reduces the effort of building microservice applications: developers no longer have to spend large amounts of time ensuring the correctness of low-level communication and can instead focus on the business requirements that create user value.
12 |
13 | However, the complexity introduced by the distributed nature of a microservice architecture does not disappear; it merely moves from the individual applications into the service mesh. Having the mesh control all inter-service communication has clear benefits: distributed communication policies stay consistent across the whole system and can be managed centrally.
14 |
15 | Besides the inherent complexity of distributed calls between microservices, the mesh also inserts a new abstraction layer between the underlying network and the applications, which adds some complexity of its own. As a result, a failure in the mesh itself can have a disastrous impact on the applications running on top of it.
16 |
17 | When communication between applications misbehaves, we can use the mesh's distributed tracing, fault injection, and routing features to analyze and fix the problem quickly. But what if the mesh itself is at fault? How can we analyze and resolve such problems quickly?
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "learning-envoy",
3 | "version": "0.0.1",
4 | "description": "Istio 运维实战.",
5 | "main": "none.js",
6 | "scripts": {
7 | "test": "echo \"Error: no test specified\" && exit 1"
8 | },
9 | "repository": {
10 | "type": "git",
11 | "url": "git+https://github.com/zhaohuabing/learning-envoy.git"
12 | },
13 | "author": "Huabing Zhao",
14 | "license": "ISC",
15 | "bugs": {
16 | "url": "https://github.com/zhaohuabing/learning-envoy/issues"
17 | },
18 | "homepage": "http://zhaohuabing.com/learning-envoy",
19 | "devDependencies": {
20 | "autoprefixer": "^10.4.0",
21 | "postcss": "^8.3.7",
22 | "postcss-cli": "^9.0.2"
23 | }
24 | }
25 |
--------------------------------------------------------------------------------
/content/zh/docs/common-problem/envoy-stats-memory/_index.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Custom Metrics Cause Rapid Envoy Memory Growth"
3 | linkTitle: ""
4 | weight: 14
5 | date: 2023-02-10
6 | description: A custom metrics dimension causes Envoy memory to grow rapidly.
7 | ---
8 |
9 | ## Symptom
10 |
11 | Envoy memory grows rapidly and soon runs out, causing the Pod to restart repeatedly. Dumping the heap and inspecting it with pprof shows a large number of objects related to Stat Tags.
12 |
13 | 
14 |
15 | ## Root Cause
16 |
17 | In Envoy, Stats Tags are used for metrics reporting, so the suspicion fell on a change to the metrics configuration. Inspecting Istiod's EnvoyFilter stats-filter-1.12 showed that it adds a dimension ```request_path:request.path``` to the metrics, as highlighted in the figure below. This configuration adds a tag to the metrics whose value is taken from the path field of the HTTP request header. Because the path in this service's requests contains variables such as user tokens, the set of possible path values is huge, so the number of metric instances in Envoy exploded and eventually caused the out-of-memory condition.
18 |
19 | 
20 |
21 | ## Solution
22 |
23 | Removing the request_path dimension from the EnvoyFilter resolves the problem.
24 |
25 | The lesson from this incident: be very careful when adding tags to Istio data-plane metrics. Do not casually add tags with a large value range, especially tags whose values are effectively unbounded; they multiply the memory used by metrics. For example, adding a tag with 10 possible values can in theory increase the memory used by the affected metrics tenfold.
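26 |
27 | For reference, below is a much-simplified sketch of what such a dimension definition typically looks like inside the stats-filter EnvoyFilter (the wasm wrapper fields are abridged and may differ between Istio versions; `request_path` / `request.path` are the values from this incident). Removing this dimension is what resolves the problem.
28 |
29 | ```yaml
30 | # Abridged sketch, not a complete EnvoyFilter: only the fragment that defines the extra dimension.
31 | patch:
32 |   operation: MERGE
33 |   value:
34 |     name: istio.stats
35 |     typed_config:
36 |       "@type": type.googleapis.com/udpa.type.v1.TypedStruct
37 |       type_url: type.googleapis.com/envoy.extensions.filters.http.wasm.v3.Wasm
38 |       value:
39 |         config:
40 |           configuration:
41 |             "@type": type.googleapis.com/google.protobuf.StringValue
42 |             value: |
43 |               {
44 |                 "metrics": [
45 |                   {
46 |                     "dimensions": {
47 |                       "request_path": "request.path"
48 |                     }
49 |                   }
50 |                 ]
51 |               }
52 | ```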
--------------------------------------------------------------------------------
/content/zh/docs/common-problem/application-start-fail/_index.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Application Fails to Start"
3 | linkTitle: "Application Fails to Start"
4 | weight: 1
5 | date: 2022-07-06
6 | description: An application with a sidecar injected fails to start.
7 | ---
8 |
9 | ## Symptom
10 |
11 | An application with a Sidecar proxy installed cannot reach services outside its Pod over the network for a short period after it starts. Applications typically fetch data from external services during startup and use that data to initialize themselves, for example reading configuration from a configuration center or loading initial user data from a database. With a Sidecar proxy installed, however, the network is unavailable for a short period right after the application starts. If the application code has no suitable fault-tolerance and retry logic, this often causes the application to fail to start.
12 |
13 | ## Root Cause
14 |
15 | As shown in the figure below, after Envoy starts it requests service and routing configuration from Pilot over the xDS protocol. Pilot assembles the configuration (Listeners, Routes, Clusters, and so on) for the node (Pod or VM) where Envoy runs and pushes it back over xDS. Depending on the size of the mesh and network conditions, this push can take from a few seconds to tens of seconds. During that window the init container has already installed the iptables rules in the Pod, so outbound traffic from the application is redirected to Envoy, but Envoy does not yet have listeners and routes that can handle those requests, and the requests fail. (For more on Envoy sidecar initialization and Istio traffic management internals, see [Istio流量管理实现机制深度解析](https://zhaohuabing.com/post/2018-09-25-istio-traffic-management-impl-intro/).)
16 |
17 | 
18 |
19 | ## Solution
20 |
21 | See: [Best practice: start the application container only after the sidecar is initialized](../best-practice/startup-dependence.md)
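22 |
23 | For quick reference, the core of that best practice on Istio 1.7 and later is the `holdApplicationUntilProxyStarts` option, which can be enabled mesh-wide or per Deployment; a minimal per-Deployment sketch (taken from the linked page):
24 |
25 | ```yaml
26 | template:
27 |   metadata:
28 |     annotations:
29 |       proxy.istio.io/config: '{ "holdApplicationUntilProxyStarts": true }'
30 | ```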
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # How to Contribute
2 |
3 | We'd love to accept your patches and contributions to this project. There are
4 | just a few small guidelines you need to follow.
5 |
6 | ## Contributor License Agreement
7 |
8 | Contributions to this project must be accompanied by a Contributor License
9 | Agreement. You (or your employer) retain the copyright to your contribution;
10 | this simply gives us permission to use and redistribute your contributions as
11 | part of the project. Head over to <https://cla.developers.google.com/> to see
12 | your current agreements on file or to sign a new one.
13 |
14 | You generally only need to submit a CLA once, so if you've already submitted one
15 | (even if it was for a different project), you probably don't need to do it
16 | again.
17 |
18 | ## Code reviews
19 |
20 | All submissions, including submissions by project members, require review. We
21 | use GitHub pull requests for this purpose. Consult
22 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
23 | information on using pull requests.
24 |
25 | ## Community Guidelines
26 |
27 | This project follows
28 | [Google's Open Source Community Guidelines](https://opensource.google.com/conduct/).
29 |
--------------------------------------------------------------------------------
/go.sum:
--------------------------------------------------------------------------------
1 | github.com/FortAwesome/Font-Awesome v0.0.0-20210804190922-7d3d774145ac h1:AjwgwoaDsNEA1Wtc8pgw/BqG7SEk9bKxXPjEPQQ42vY=
2 | github.com/FortAwesome/Font-Awesome v0.0.0-20210804190922-7d3d774145ac/go.mod h1:IUgezN/MFpCDIlFezw3L8j83oeiIuYoj28Miwr/KUYo=
3 | github.com/google/docsy v0.2.0-pre.0.20220404161753-f7b37a0aca2a h1:bnufXLbTD8QCLbqygy/kmYxUK1JINSlHU5rLQYTcFMQ=
4 | github.com/google/docsy v0.2.0-pre.0.20220404161753-f7b37a0aca2a/go.mod h1:yuKLZHMX5CKiLUH55+ePFJaYnoSwUVVffNareaOGQYo=
5 | github.com/google/docsy v0.2.0 h1:DN6wfyyp2rXsjdV1K3wioxOBTRvG6Gg48wLPDso2lc4=
6 | github.com/google/docsy v0.2.0/go.mod h1:shlabwAQakGX6qpXU6Iv/b/SilpHRd7d+xqtZQd3v+8=
7 | github.com/google/docsy/dependencies v0.2.0-pre.0.20220404161753-f7b37a0aca2a h1:fy6IqUmWGMdQngRa7+CP1cRkTseQK7OEsqx6r7dNuSA=
8 | github.com/google/docsy/dependencies v0.2.0-pre.0.20220404161753-f7b37a0aca2a/go.mod h1:oPdn05sNt61uT6K+LqNRhYq1jeqrsbbQMDXkPdPscmA=
9 | github.com/google/docsy/dependencies v0.2.0/go.mod h1:2zZxHF+2qvkyXhLZtsbnqMotxMukJXLaf8fAZER48oo=
10 | github.com/twbs/bootstrap v4.6.1+incompatible h1:75PsBfPU1SS65ag0Z3Cq6JNXVAfUNfB0oCLHh9k9Fu8=
11 | github.com/twbs/bootstrap v4.6.1+incompatible/go.mod h1:fZTSrkpSf0/HkL0IIJzvVspTt1r9zuf7XlZau8kpcY0=
12 |
--------------------------------------------------------------------------------
/layouts/partials/page-meta-lastmod.html:
--------------------------------------------------------------------------------
1 | {{ if .Site.Params.giscus }}
2 |
15 | {{ end }}
16 | {{ if and (.GitInfo) (.Site.Params.github_repo) -}}
17 |
18 | {{ T "post_last_mod" }} {{ .Lastmod.Format .Site.Params.time_format_default -}}
19 | {{ with .GitInfo }}: {{/* Trim WS */ -}}
20 |
21 | {{- .Subject }} ({{ .AbbreviatedHash }}) {{- /* Trim WS */ -}}
22 |
23 | {{- end }}
24 |
25 | {{ end -}}
26 |
--------------------------------------------------------------------------------
/content/zh/docs/debug-istio/istio-debug/_index.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Istio Debug Port"
3 | linkTitle: ""
4 | weight: 1
5 | date: 2022-10-11
6 | description:
7 | ---
8 |
9 |
10 | ## Istio debug interface
11 |
12 | Istio exposes a debug port, 15014, through which a large amount of istiod-internal debugging information can be inspected: istiod memory and CPU usage, the xDS cache, the proxies connected to istiod, cluster and endpoint information, and more.
13 |
14 | Port 15014 can be forwarded to localhost with the following command:
15 |
16 | ```bash
17 | kubectl -n istio-system port-forward svc/istiod 15014
18 | ```
19 |
20 | Opening ```http://127.0.0.1:15014/debug``` in a browser shows everything the debug port can return.
21 |
22 | 
23 |
24 | ## Inspecting istiod memory usage
25 |
26 | With the debug port forwarded to localhost, istiod's memory usage can be analyzed with the following command:
27 |
28 | ```bash
29 | go tool pprof -http=:8080 localhost:15014/debug/pprof/heap
30 | ```
31 |
32 | The command takes the heap profile exposed by the debug port as input and serves a detailed analysis of istiod's memory usage on port 8080.
33 |
34 | 
35 |
36 | ## Inspecting istiod metrics
37 |
38 | With the debug port forwarded to localhost, istiod's own metrics are available at ```http://127.0.0.1:15014/metrics``` and give a picture of how istiod is doing.
39 |
40 | Some of the more important metrics are:
41 |
42 | * pilot_xds: number of xDS clients connected to pilot.
43 | * pilot_xds_pushes: number of xDS push messages.
44 | * pilot_xds_push_time: distribution of xDS push latency.
45 |
46 | The meaning of the other metrics is documented at: https://istio.io/latest/docs/reference/commands/pilot-discovery/#metrics
47 |
48 |
--------------------------------------------------------------------------------
/content/zh/docs/tricks/direct-response/_index.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "directResponse"
3 | weight: 1
4 | date: 2022-10-17
5 | description: >
6 |   Have the proxy return a response directly for HTTP requests that match a condition.
7 | ---
8 |
9 | Sometimes we want the gateway/sidecar to return an HTTP response to the client directly, without handing the request to the application.
10 |
11 | ## Before Istio 1.15
12 |
13 | An EnvoyFilter can be used to modify the HTTP route so that requests matching a condition get a fixed response. For example, the EnvoyFilter below makes the ingress gateway answer http://*:80/direct directly with a 200 response whose body is ```hello world```:
14 |
15 | ```yaml
16 | apiVersion: networking.istio.io/v1alpha3
17 | kind: EnvoyFilter
18 | metadata:
19 | name: direct
20 | spec:
21 | workloadSelector:
22 | labels:
23 | istio: ingressgateway
24 | configPatches:
25 | - applyTo: HTTP_ROUTE
26 | match:
27 | context: GATEWAY
28 | routeConfiguration:
29 | portNumber: 80
30 | patch:
31 | operation: INSERT_FIRST
32 | value:
33 | name: direct
34 | match:
35 | path: /direct
36 | directResponse:
37 | body:
38 | inlineString: 'hello world'
39 | status: 200
40 | ```
41 |
42 | ## Istio 1.15 and later
43 |
44 | Starting with 1.15, a VirtualService can set directResponse directly, as shown below:
45 |
46 | ```yaml
47 | apiVersion: networking.istio.io/v1alpha3
48 | kind: VirtualService
49 | metadata:
50 | name: ratings-route
51 | spec:
52 | hosts:
53 | - ratings.prod.svc.cluster.local
54 | http:
55 | - match:
56 | - uri:
57 | exact: /v1/getProductRatings
58 | directResponse:
59 | status: 503
60 | body:
61 | string: "unknown error"
62 | ...
63 | ```
64 |
65 | ## References
66 |
67 | * EnvoyFilter https://istio.io/latest/docs/reference/config/networking/envoy-filter/
68 | * HTTPDirectResponse https://istio.io/latest/docs/reference/config/networking/virtual-service/#HTTPDirectResponse
69 |
--------------------------------------------------------------------------------
/content/zh/docs/tricks/perodical-access-log/_index.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Periodic Access Logs for Long-Lived TCP Connections"
3 | weight: 3
4 | date: 2023-05-16
5 | description: >
6 |   Emit access logs periodically for long-lived TCP connections.
7 | ---
8 |
9 | By default, the access log for a TCP connection is written only after the connection closes, so for long-lived connections no access log appears for a long time after the connection is established. The EnvoyFilter below makes the proxy emit access logs periodically.
10 |
11 | ```yaml
12 |
13 | apiVersion: networking.istio.io/v1alpha3
14 | kind: EnvoyFilter
15 | metadata:
16 | name: periodical-access-log
17 | namespace: istio-system # apply to all sidecars
18 | spec:
19 | configPatches:
20 | - applyTo: NETWORK_FILTER
21 | match:
22 | listener:
23 | filterChain:
24 | filter:
25 | name: "envoy.filters.network.tcp_proxy"
26 | patch:
27 | operation: MERGE
28 | value:
29 | name: "envoy.filters.network.tcp_proxy"
30 | typed_config:
31 | "@type": "type.googleapis.com/envoy.extensions.filters.network.tcp_proxy.v3.TcpProxy"
32 | access_log_flush_interval: 5s
33 | ```
34 |
35 | After this EnvoyFilter is applied, the sidecar proxy emits an access log entry every 5 seconds, as shown below:
36 |
37 | ```bash
38 | [2023-05-15T10:37:08.842Z] "- - -" 0 - - - "-" 3 0 - - "-" "-" "-" "-" "10.244.0.70:9080" outbound|9080||productpage.default.svc.cluster.local 10.244.0.72:41238 10.96.219.213:9080 10.244.0.72:53492 - -
39 | [2023-05-15T10:37:08.842Z] "- - -" 0 - - - "-" 3 0 - - "-" "-" "-" "-" "10.244.0.70:9080" outbound|9080||productpage.default.svc.cluster.local 10.244.0.72:41238 10.96.219.213:9080 10.244.0.72:53492 - -
40 | [2023-05-15T10:37:08.842Z] "- - -" 0 - - - "-" 3 0 - - "-" "-" "-" "-" "10.244.0.70:9080" outbound|9080||productpage.default.svc.cluster.local 10.244.0.72:41238 10.96.219.213:9080 10.244.0.72:53492 - -
41 | [2023-05-15T10:37:08.842Z] "- - -" 0 - - - "-" 3 0 - - "-" "-" "-" "-" "10.244.0.70:9080" outbound|9080||productpage.default.svc.cluster.local 10.244.0.72:41238 10.96.219.213:9080 10.244.0.72:53492 - -
42 | ```
--------------------------------------------------------------------------------
/content/zh/docs/common-problem/unbalanced-workers/_index.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Long-Lived Connections Cause Unbalanced Envoy CPU Load"
3 | linkTitle: "Long-Lived Connections Cause Unbalanced Envoy CPU Load"
4 | weight: 11
5 | date: 2022-11-07
6 | description: Envoy CPU load is unbalanced when handling long-lived connections.
7 | ---
8 |
9 | ## Symptom
10 |
11 | The workload is unevenly distributed across the CPUs used by the Envoy process. As the figure below shows, the ingress gateway has 24 workers in total, but only three of them show high CPU usage while the rest are nearly idle.
12 |
13 | When this happens, requests back up and latency increases (possibly to the point of timeouts) because Envoy's effective processing capacity is exhausted even though CPU capacity is still available.
14 |
15 | 
16 |
17 |
18 | ## Root Cause
19 |
20 | This behavior comes from [Envoy's threading model](https://blog.envoyproxy.io/envoy-threading-model-a8d44b922310). Envoy uses multiple worker threads (typically one per CPU core) to accept and handle downstream connections. Once a connection is accepted, all subsequent processing for that connection happens on a single worker thread. This model keeps per-connection processing single-threaded and simplifies the code.
21 |
22 | By default, Envoy does not balance the number of connections across worker threads. When most downstream connections are short-lived, the operating system spreads them across the workers fairly evenly. With long-lived connections (for example HTTP/2 or gRPC), however, the number of connections assigned to each worker can become uneven, which is why some CPUs are busy while others are idle.
23 |
24 | ## Solution
25 |
26 | Envoy's [listener](https://www.envoyproxy.io/docs/envoy/latest/api-v3/config/listener/v3/listener.proto) configuration provides a [```connection_balance_config```](https://www.envoyproxy.io/docs/envoy/latest/api-v3/config/listener/v3/listener.proto#envoy-v3-api-msg-config-listener-v3-listener-connectionbalanceconfig) option that forces connections to be distributed evenly across worker threads.
27 |
28 | When there are many long-lived connections, ```connection_balance_config``` can be enabled with an EnvoyFilter. The EnvoyFilter below enables worker connection balancing for the ingress gateway.
29 |
30 | ```yaml
31 | apiVersion: networking.istio.io/v1alpha3
32 | kind: EnvoyFilter
33 | metadata:
34 | name: ingress-envoy-listener-balance
35 | namespace: istio-system
36 | spec:
37 | workloadSelector:
38 | labels:
39 | istio: ingressgateway
40 | configPatches:
41 | - applyTo: LISTENER
42 | match:
43 | context: GATEWAY
44 | patch:
45 | operation: MERGE
46 | value:
47 | connection_balance_config:
48 | exact_balance: {}
49 | ```
50 |
51 | With this configuration in place, CPU usage becomes roughly even across all cores.
52 | 
53 |
54 | Notes:
55 | * If a listener hands all of its connections over to other listeners via ```use_original_dst```, do not set ```connection_balance_config``` on that listener, to avoid adding overhead there; set it on the listeners that actually process the connections instead. See the [note in the Envoy documentation](https://www.envoyproxy.io/docs/envoy/latest/api-v3/config/listener/v3/listener.proto).
56 | * Connection balancing introduces a lock shared across the worker threads, so it may have some impact on Envoy's connection-setup performance.
--------------------------------------------------------------------------------
/content/zh/docs/common-problem/gateway-tcp-port-not-found/_index.md:
--------------------------------------------------------------------------------
1 |
2 | ---
3 | title: "Cannot Connect to a TCP Port on the Gateway"
4 | linkTitle: "Cannot Connect to a TCP Port on the Gateway"
5 | weight: 10
6 | date: 2022-10-17
7 | description:
8 | ---
9 |
10 | ## Symptom
11 |
12 | A TCP port is defined in a Gateway resource, but connections to that port on the gateway fail.
13 |
14 | ## Root Cause
15 |
16 | If a TCP port is defined in a Gateway but no VirtualService configures a route for it, the port simply does not exist on the gateway.
17 |
18 | For example, the Gateway below defines TCP port 8888 on the ingress gateway.
19 |
20 | ```yaml
21 | apiVersion: networking.istio.io/v1beta1
22 | kind: Gateway
23 | metadata:
24 | name: ingressgw
25 | spec:
26 | selector:
27 | app: istio-ingressgateway
28 | istio: ingressgateway
29 | servers:
30 | - hosts:
31 | - '*'
32 | port:
33 | name: TCP-8888
34 | number: 8888
35 | protocol: TCP
36 | ```
37 |
38 | Port 8888 turns out to be unreachable through the ingress gateway. Inspecting the ingress gateway's listener configuration shows no listener on port 8888:
39 | ```bash
40 | istioctl -n istio-system proxy-config listeners istio-ingressgateway-74fd488699-4v4rt
41 | ADDRESS PORT MATCH DESTINATION
42 | 0.0.0.0 15021 ALL Inline Route: /healthz/ready*
43 | 0.0.0.0 15090 ALL Inline Route: /stats/prometheus*
44 | ```
45 |
46 | The istiod log contains the following error:
47 |
48 | ```bash
49 | gateway omitting listener "0.0.0.0_8888" due to: must have more than 0 chains in listener "0.0.0.0_8888"
50 | ```
51 |
52 | The reason is that when istiod tries to generate the listener, the filter chain is empty, so istiod drops the listener.
53 |
54 | ## Solution
55 |
56 | Configure a route for the port with a VirtualService; the filter chain is then no longer empty and istiod can generate the listener normally.
57 |
58 | Create the VirtualService:
59 |
60 | ```yaml
61 | apiVersion: networking.istio.io/v1beta1
62 | kind: VirtualService
63 | metadata:
64 | name: ingress
65 | spec:
66 | gateways:
67 | - ingressgw
68 | hosts:
69 | - '*'
70 | tcp:
71 | - match:
72 | - port: 8888
73 | route:
74 | - destination:
75 | host: details.default.svc.cluster.local
76 | port:
77 | number: 9080
78 | ```
79 |
80 | Inspecting the ingress gateway configuration again shows that the listener for 8888 has been generated:
81 |
82 | ```bash
83 | istioctl -n istio-system proxy-config listeners istio-ingressgateway-74fd488699-4v4rt
84 | ADDRESS PORT MATCH DESTINATION
85 | 0.0.0.0 8888 ALL Cluster: outbound|9080||details.default.svc.cluster.local
86 | 0.0.0.0 15021 ALL Inline Route: /healthz/ready*
87 | 0.0.0.0 15090 ALL Inline Route: /stats/prometheus*
88 | ```
89 |
90 | > Note: HTTP ports behave differently from TCP ports. If a Gateway defines an HTTP port without a corresponding VirtualService, the port is still reachable and the gateway returns an HTTP 404.
--------------------------------------------------------------------------------
/content/zh/docs/common-problem/external-name-service-highjacks/_index.md:
--------------------------------------------------------------------------------
1 |
2 | ---
3 | title: "An ExternalName Service Hijacks Traffic to Other Services"
4 | linkTitle: "An ExternalName Service Hijacks Traffic to Other Services"
5 | weight: 2
6 | date: 2022-07-06
7 | description:
8 | ---
9 |
10 | ## Symptom
11 |
12 | If the mesh contains a Service of type ExternalName, and a workload in the mesh accesses some other external service on a port that happens to collide with that ExternalName Service's port, the traffic is routed to the cluster (CDS) of the ExternalName Service instead.
13 |
14 | ## Reproducing the Problem
15 |
16 | ### Normal case
17 |
18 | Install the sleep Pod in namespace sample:
19 |
20 | ```
21 | kubectl create ns sample
22 | kubectl label ns sample istio-injection=enabled
23 | kubectl -nsample apply -f https://raw.githubusercontent.com/istio/istio/1.11.4/samples/sleep/sleep.yaml
24 | ```
25 |
26 | Access the external service https://httpbin.org:443 through sleep; the request succeeds:
27 |
28 | ```
29 | kubectl -nsample exec sleep-74b7c4c84c-22zkq -- curl -I https://httpbin.org
30 | HTTP/2 200
31 | ......
32 | ```
33 |
34 | The access log confirms that the traffic left through the PassthroughCluster, as expected:
35 |
36 | ```
37 | "- - -" 0 - - - "-" 938 5606 1169 - "-" "-" "-" "-" "18.232.227.86:443" PassthroughCluster 172.24.0.10:42434 18.232.227.86:443 172.24.0.10:42432 - -
38 | ```
39 |
40 | ### Failure case
41 |
42 | Now create an ExternalName Service in the default namespace, also on port 443:
43 |
44 | ```
45 | kind: Service
46 | apiVersion: v1
47 | metadata:
48 | name: my-externalname
49 | spec:
50 | type: ExternalName
51 | externalName: bing.com
52 | ports:
53 | - port: 443
54 | targetPort: 443
55 | ```
56 |
57 | Access the external service https://httpbin.org:443 through sleep again; the request now fails:
58 |
59 | ```
60 | kubectl -nsample exec sleep-74b7c4c84c-22zkq -- curl -I https://httpbin.org
61 | curl: (60) SSL: no alternative certificate subject name matches target host name 'httpbin.org'
62 | More details here: https://curl.se/docs/sslcerts.html
63 | ......
64 | ```
65 |
66 | The access log shows that the request to the external service was wrongly routed to the my-externalname ExternalName Service:
67 |
68 | ```
69 | "- - -" 0 - - - "-" 706 5398 67 - "-" "-" "-" "-" "204.79.197.200:443" outbound|443||my-externalname.default.svc.cluster.local 172.24.0.10:56806 34.192.79.103:443 172.24.0.10:36214 httpbin.org -
70 | ```
71 |
72 | ## Root Cause
73 |
74 | Comparing the sleep Pod's xDS before and after adding the ExternalName Service shows that a new listener (LDS) `0.0.0.0_443` appears. This listener contains a `default_filter_chain` that catches all traffic not matched by the listener's other filter chains and routes it to that chain's cluster, which is the CDS of `my-externalname`:
75 |
76 |
77 | 
78 |
79 |
80 | ## Solution
81 |
82 | This is an Istio implementation deficiency; see the related issue: https://github.com/istio/istio/issues/20703
83 |
84 | The current workaround is to avoid port collisions between ExternalName Services and other services.
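85 |
86 | To make the workaround concrete, here is a sketch of the same ExternalName Service moved to a port that does not overlap with ports used for other traffic; 8443 is an arbitrary value chosen for illustration:
87 |
88 | ```yaml
89 | kind: Service
90 | apiVersion: v1
91 | metadata:
92 |   name: my-externalname
93 | spec:
94 |   type: ExternalName
95 |   externalName: bing.com
96 |   ports:
97 |   - port: 8443        # no longer collides with port 443 used by external HTTPS services
98 |     targetPort: 443
99 | ```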
--------------------------------------------------------------------------------
/content/zh/docs/debug-istio/envoy-profiler/_index.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Envoy Memory/CPU Profiling"
3 | linkTitle: ""
4 | weight: 3
5 | date: 2023-02-08
6 | description:
7 | ---
8 |
9 | # Exporting Envoy CPU and memory profiles
10 | https://github.com/istio/istio/wiki/Analyzing-Istio-Performance#profile
11 |
12 | ## Profile
13 |
14 | On Istio 1.5 and older:
15 |
16 | ```bash
17 | export POD=pod-name
18 | export NS=istio-system
19 | kubectl exec -n "$NS" "$POD" -c istio-proxy -- sh -c 'sudo mkdir -p /var/log/envoy && sudo chmod 777 /var/log/envoy && curl -X POST -s "http://localhost:15000/heapprofiler?enable=y"'
20 | sleep 15
21 | kubectl exec -n "$NS" "$POD" -c istio-proxy -- sh -c 'curl -X POST -s "http://localhost:15000/heapprofiler?enable=n"'
22 | rm -rf /tmp/envoy
23 | kubectl cp -n "$NS" "$POD":/var/log/envoy/ /tmp/envoy -c istio-proxy
24 | kubectl cp -n "$NS" "$POD":/lib/x86_64-linux-gnu /tmp/envoy/lib -c istio-proxy
25 | kubectl cp -n "$NS" "$POD":/usr/local/bin/envoy /tmp/envoy/lib/envoy -c istio-proxy
26 | ```
27 |
28 | On Istio 1.6+
29 |
30 | ```bash
31 | export POD=pod-name
32 | export NS=istio-system
33 | export PROFILER="heap" # Can also be "cpu", for a cpu profile
34 | kubectl exec -n "$NS" "$POD" -c istio-proxy -- curl -X POST -s "http://localhost:15000/${PROFILER}profiler?enable=y"
35 | sleep 15
36 | kubectl exec -n "$NS" "$POD" -c istio-proxy -- curl -X POST -s "http://localhost:15000/${PROFILER}profiler?enable=n"
37 | rm -rf /tmp/envoy
38 | kubectl cp -n "$NS" "$POD":/var/lib/istio/data /tmp/envoy -c istio-proxy
39 | kubectl cp -n "$NS" "$POD":/lib/x86_64-linux-gnu /tmp/envoy/lib -c istio-proxy
40 | kubectl cp -n "$NS" "$POD":/usr/local/bin/envoy /tmp/envoy/lib/envoy -c istio-proxy
41 | ```
42 |
43 | Note: copying the envoy binary with kubectl cp sometimes fails; the cat command can be used instead:
44 |
45 | ```bash
46 | kubectl -n "$NS" exec "$POD" -c istio-proxy -- cat /usr/local/bin/envoy > /tmp/envoy/lib/envoy
47 | ```
48 |
49 | ## Visualize profile pprof installation
50 |
51 | Install pprof, then run:
52 |
53 | ```bash
54 | PPROF_BINARY_PATH=/tmp/envoy/lib/ pprof -pdf /tmp/envoy/lib/envoy /tmp/envoy/envoy.prof.0001.heap
55 | ```
56 |
57 | Or, interactively
58 |
59 | ```bash
60 | PPROF_BINARY_PATH=/tmp/envoy/lib/ pprof /tmp/envoy/lib/envoy /tmp/envoy/envoy.prof.0001.heap
61 | ```
62 |
63 | Or, through the web UI
64 |
65 | ```bash
66 | PPROF_BINARY_PATH=/tmp/envoy/lib/ pprof -http=localhost:8000 /tmp/envoy/lib/envoy /tmp/envoy/envoy.prof.0001.heap
67 | ```
68 |
69 | # Inspecting memory usage via the Envoy admin interface
70 |
71 | ```bash
72 | kubectl exec -n "$NS" "$POD" -c istio-proxy -- curl "http://localhost:15000/memory"
73 | ```
74 |
75 | Output:
76 | ```json
77 | {
78 | "allocated": "221674328",
79 | "heap_size": "361693184",
80 | "pageheap_unmapped": "86106112",
81 | "pageheap_free": "21831680",
82 | "total_thread_cache": "9805104",
83 | "total_physical_bytes": "278470656"
84 | }
85 | ```
86 |
87 | Field meanings:
88 |
89 | https://www.envoyproxy.io/docs/envoy/latest/api-v3/admin/v3/memory.proto
90 |
--------------------------------------------------------------------------------
/content/zh/docs/best-practice/graceful-termination/_index.md:
--------------------------------------------------------------------------------
1 |
2 | ---
3 | title: "Graceful Sidecar/Gateway Shutdown"
4 | linkTitle: "Graceful Sidecar/Gateway Shutdown"
5 | weight: 2
6 | date: 2022-12-01
7 | description:
8 | ---
9 |
10 | ## How Envoy exits in Istio
11 | By default, after receiving SIGTERM, istio-agent waits for terminationDrainDuration (5s by default) and then exits; since Envoy is a child process of istio-agent, Envoy exits with it. This default behavior can hurt long-running, critical requests: connections that are still processing business traffic get cut off.
12 |
13 | ## Graceful exit with EXIT_ON_ZERO_ACTIVE_CONNECTIONS
14 | Istio 1.12 introduced the EXIT_ON_ZERO_ACTIVE_CONNECTIONS environment variable for istio-agent, which enables a graceful Envoy exit. When the variable is set to true, istio-agent checks the number of active connections in Envoy at a fixed 1-second interval and only exits once the count reaches zero. The relevant istio-agent code looks like this:
15 |
16 | ```go
17 | // When EXIT_ON_ZERO_ACTIVE_CONNECTIONS is true, wait until the number of active connections reaches 0 before exiting
18 | if a.exitOnZeroActiveConnections {
19 | log.Infof("Agent draining proxy for %v, then waiting for active connections to terminate...", a.minDrainDuration)
20 | time.Sleep(a.minDrainDuration)
21 | log.Infof("Checking for active connections...")
22 | ticker := time.NewTicker(activeConnectionCheckDelay)
23 | for range ticker.C {
24 | if a.activeProxyConnections() == 0 {
25 | log.Info("There are no more active connections. terminating proxy...")
26 | a.abortCh <- errAbort
27 | return
28 | }
29 | }
30 | } else { // By default, wait 5s and then exit
31 | log.Infof("Graceful termination period is %v, starting...", a.terminationDrainDuration)
32 | time.Sleep(a.terminationDrainDuration)
33 | log.Infof("Graceful termination period complete, terminating remaining proxies.")
34 | a.abortCh <- errAbort
35 | }
36 | ```
37 |
38 | ## Configuration
39 |
40 | ### Global configuration
41 |
42 | ```yaml
43 | meshConfig:
44 | defaultConfig:
45 | proxyMetadata:
46 | EXIT_ON_ZERO_ACTIVE_CONNECTIONS: 'true'
47 | ```
48 |
49 | ### Per-workload configuration
50 | Add the EXIT_ON_ZERO_ACTIVE_CONNECTIONS environment variable for pilot-agent via an annotation on the Deployment.
51 | ```yaml
52 | annotations:
53 | proxy.istio.io/config: |
54 | proxyMetadata:
55 | EXIT_ON_ZERO_ACTIVE_CONNECTIONS: 'true'
56 | ```
57 |
58 | ## Setting the Pod's terminationGracePeriodSeconds
59 |
60 | After sending SIGTERM to a Pod, Kubernetes waits 30s by default and then sends SIGKILL if the Pod has not terminated. So even with EXIT_ON_ZERO_ACTIVE_CONNECTIONS set to true, Envoy can wait at most 30s. If your application needs longer, set the Pod's terminationGracePeriodSeconds. The example below raises it from the default 30s to 60s.
61 |
62 | ```yaml
63 | apiVersion: apps/v1
64 | kind: Deployment
65 | metadata:
66 | name: test
67 | spec:
68 | replicas: 1
69 | template:
70 | spec:
71 | containers:
72 | - name: test
73 | image: ...
74 | terminationGracePeriodSeconds: 60
75 | ```
76 |
77 | ## 参考链接
78 |
79 | * [Kubernetes best practices: terminating with grace](https://cloud.google.com/blog/products/containers-kubernetes/kubernetes-best-practices-terminating-with-grace)
80 | * [Istio-agent Environment variables](https://istio.io/latest/docs/reference/commands/pilot-agent/#envvars)
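81 |
82 | One additional note: if waiting for zero active connections is more than you need, the default 5s drain window itself can also be extended via terminationDrainDuration (independently of EXIT_ON_ZERO_ACTIVE_CONNECTIONS); a minimal global sketch, with 30s as an illustrative value:
83 |
84 | ```yaml
85 | meshConfig:
86 |   defaultConfig:
87 |     terminationDrainDuration: 30s
88 | ```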
--------------------------------------------------------------------------------
/content/zh/_index.html:
--------------------------------------------------------------------------------
1 | +++
2 | title = "Istio Operations in Practice"
3 | linkTitle = "Istio Operations in Practice"
4 |
5 | +++
6 |
7 | {{< blocks/cover title="Istio Operations in Practice" image_anchor="top" height="full" color="orange" >}}
8 |
20 | {{< /blocks/cover >}}
21 |
22 |
23 |
86 |
--------------------------------------------------------------------------------
/content/zh/docs/common-problem/tls-wrong-sni/_index.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "503 UC Errors When Accessing an External Service via the Ingress Gateway"
3 | linkTitle: "503 UC Errors When Accessing an External Service via the Ingress Gateway"
4 | weight: 12
5 | date: 2022-11-23
6 | description: Envoy returns 503 UC when an external HTTPS service is requested with an SNI that differs from the service's domain name.
7 | ---
8 |
9 | ## Symptom
10 |
11 | This is a fairly unusual scenario: a user accesses an HTTPS service outside the cluster through the ingress gateway, and the ingress gateway returns a 503 UC error.
12 |
13 | The access path is:
14 |
15 | Browser --> Ingress Gateway(foo.bar.org) --> External Service(dev.bar.org)
16 |
17 | The VirtualService configured on the ingress gateway:
18 |
19 | ```yaml
20 | apiVersion: networking.istio.io/v1beta1
21 | kind: VirtualService
22 | metadata:
23 | name: nginx-vs-dev
24 | namespace: foo-dev
25 | spec:
26 | gateways:
27 | - foo-dev/barl-org
28 | hosts:
29 | - foo.bar.org
30 | http:
31 | - match:
32 | - uri:
33 | prefix: /api/test
34 | route:
35 | - destination:
36 | host: dev.bar.org
37 | port:
38 | number: 443
39 | ```
40 |
41 | The ServiceEntry for the external service:
42 |
43 | ```yaml
44 | apiVersion: networking.istio.io/v1beta1
45 | kind: ServiceEntry
46 | metadata:
47 | name: dev-test
48 | namespace: foo-dev
49 | spec:
50 | hosts:
51 | - dev.bar.org
52 | location: MESH_EXTERNAL
53 | ports:
54 | - name: https
55 | number: 443
56 | protocol: HTTPS
57 | resolution: DNS
58 | ```
59 |
60 | The error log entry from the ingress gateway:
61 |
62 | ```json
63 | {
64 | "upstream_cluster":"outbound|443||dev.bar.org",
65 | "response_flags":"UC",
66 | "authority":"foo.bar.org",
67 | "upstream_host":"47.107.45.209:443",
68 | "bytes_sent":95,
69 | "downstream_remote_address":"182.140.153.175:2223",
70 | "downstream_local_address":"192.168.32.49:443",
71 | "upstream_transport_failure_reason":null,
72 | "istio_policy_status":null,
73 | "response_code":503,
74 | "duration":19,
75 | "request_id":"054c265b-46eb-4524-a892-810ceeb26e64",
76 | "path":"/api/test",
77 | "protocol":"HTTP/2",
78 | "requested_server_name":"foo.bar.org",
79 | "upstream_local_address":"192.168.32.49:53382",
80 | "x_forwarded_for":"182.140.153.175",
81 | "start_time":"2022-11-23T08:08:52.303Z",
82 | "upstream_service_time":null,
83 | "bytes_received":0,
84 | "user_agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36",
85 | "method":"GET",
86 | "route_name":null
87 | }
88 | ```
89 |
90 | ## Root Cause
91 |
92 | When the user goes through the ingress gateway, the SNI presented is the gateway's own domain, ```foo.bar.org```. When Envoy performs the TLS handshake with the upstream and nothing else is configured, it uses the downstream SNI by default. In this case the correct SNI for the upstream is ```dev.bar.org```. Because the SNI does not match, the TLS handshake between the ingress gateway and the external service fails and the gateway reports 503 UC.
93 |
94 | ## Solution
95 |
96 | Create the following DestinationRule to specify the SNI to use when accessing the external service.
97 |
98 | ```yaml
99 | apiVersion: networking.istio.io/v1alpha3
100 | kind: DestinationRule
101 | metadata:
102 | name: dev-test
103 | spec:
104 | host: dev.bar.org
105 | trafficPolicy:
106 | tls:
107 | mode: SIMPLE
108 | sni: dev.bar.org
109 | ```
--------------------------------------------------------------------------------
/content/zh/docs/best-practice/startup-dependence/_index.md:
--------------------------------------------------------------------------------
1 |
2 | ---
3 | title: "Start the Application Only After the Sidecar Is Initialized"
4 | linkTitle: "Start the Application Only After the Sidecar Is Initialized"
5 | weight: 1
6 | date: 2022-07-06
7 | description:
8 | ---
9 |
10 | ## Why does the startup order of the sidecar and the application matter?
11 | In a Pod with a sidecar proxy installed, outbound network requests from the application are redirected to the proxy by iptables rules. If the application sends requests before the proxy has finished initializing, the proxy cannot route them correctly and the requests fail. See [Common problems: application fails to start / no network access at startup](../common-problem/application-start-fail.md) for the resulting symptoms.
12 |
13 | ## Configuration: Istio 1.7 and later
14 | In Istio 1.7 and later, the application container can be configured to start only after the sidecar has finished initializing, as follows.
15 |
16 | Global configuration:
17 |
18 | Set the global option `holdApplicationUntilProxyStarts` to true in the istio-system/istio ConfigMap.
19 |
20 | ```yaml
21 | apiVersion: v1
22 | data:
23 | mesh: |-
24 | defaultConfig:
25 | holdApplicationUntilProxyStarts: true
26 | ```
27 |
28 | Per-Deployment configuration:
29 |
30 | If you do not want the setting to take effect mesh-wide, it can be enabled at the Deployment level with the following annotation.
31 |
32 | ```yaml
33 | template:
34 | metadata:
35 | annotations:
36 | proxy.istio.io/config: '{ "holdApplicationUntilProxyStarts": true }'
37 | ```
38 |
39 | How it works: with `holdApplicationUntilProxyStarts` enabled, the Istio sidecar injector webhook inserts the YAML fragment below into the Pod. It runs `pilot-agent wait` in the sidecar proxy's postStart lifecycle hook; the command polls the proxy's status and returns only after the proxy has finished initializing, so the next container in the Pod starts only then. As a result, by the time the application container starts, the sidecar proxy has finished loading its configuration and can correctly proxy the application's outbound requests.
40 |
41 | ```yaml
42 | spec:
43 | containers:
44 | - name: istio-proxy
45 | lifecycle:
46 | postStart:
47 | exec:
48 | command:
49 | - pilot-agent
50 | - wait
51 | ```
52 |
53 | ## Configuration: before Istio 1.7
54 |
55 | Istio versions before 1.7 do not provide a direct way to control the startup order of the sidecar and the application container. Since newer Istio releases fix many problems found in older ones, upgrading is recommended whenever possible. If for some special reason you must stay on a pre-1.7 version, the application process can check the Envoy sidecar's initialization status at startup and start only after it has completed.
56 |
57 | Envoy's health-check endpoint localhost:15020/healthz/ready returns 200 only after the xDS configuration has been initialized and 503 before that, so it can be used to determine when Envoy is ready and to delay the application until then. We can add a script that polls this endpoint to the application container's start command, as in the snippet below; to use it for another application, replace start-awesome-app-cmd with that container's start command.
58 |
59 | ```yaml
60 | apiVersion: apps/v1
61 | kind: Deployment
62 | metadata:
63 | name: awesome-app-deployment
64 | spec:
65 | selector:
66 | matchLabels:
67 | app: awesome-app
68 | replicas: 1
69 | template:
70 | metadata:
71 | labels:
72 | app: awesome-app
73 | spec:
74 | containers:
75 | - name: awesome-app
76 | image: awesome-app
77 | ports:
78 | - containerPort: 80
79 | command: ["/bin/bash", "-c"]
80 | args: ["while [[ \"$(curl -s -o /dev/null -w ''%{http_code}'' localhost:15020/healthz/ready)\" != '200' ]]; do echo Waiting for Sidecar;sleep 1; done; echo Sidecar available; start-awesome-app-cmd"]
81 | ```
82 |
83 | ## Decoupling startup dependencies between services
84 |
85 | The approach above controls the container start order inside a Pod: the application container starts only after the Envoy sidecar has initialized, so the application can reach other services over the network as soon as it starts. But even when the Pod's outbound network access is fine, the services the application depends on may not yet be started, or may be unable to serve requests for other reasons. To solve the problem thoroughly, decouple the startup dependencies between services so that the application no longer hard-depends on other services at startup.
86 |
87 | In a microservice system, the modules of the former monolith are split into independent processes (services). These services start in an arbitrary order and communicate over an unreliable network. The realities of multi-process deployment and cross-process network communication make call failures between services a normal occurrence. To cope with this, a basic design principle for microservices is "design for failure": handle the various possible error conditions gracefully. When a dependent external service cannot be reached, apply retries, degradation, timeouts, circuit breaking, and similar strategies so the system keeps running as well as possible.
88 |
89 | The temporary loss of network access during Envoy sidecar initialization merely amplifies a system's failure to handle service dependencies correctly; even with the sidecar ordering solved, the problem remains. Suppose an application depends on a configuration center, itself an independent microservice. When a service that depends on the configuration center starts, the configuration center may not have started yet, or may not have finished initializing. Without handling for that case in the code, the dependent service will also fail to start. In a more complex system with a web of dependencies between many services, merely getting the whole system started becomes a huge challenge unless the services are built for failure.
--------------------------------------------------------------------------------
/content/zh/docs/best-practice/http-header-case/_index.md:
--------------------------------------------------------------------------------
1 |
2 | ---
3 | title: "Controlling HTTP Header Case in Istio"
4 | linkTitle: "Controlling HTTP Header Case in Istio"
5 | weight: 2
6 | date: 2022-07-06
7 | description:
8 | ---
9 |
10 | # Controlling HTTP Header Case in Istio
11 |
12 | ## Background
13 |
14 | By default Envoy lower-cases HTTP header keys: a header Test-Upper-Case-Header: some-value becomes test-upper-case-header: some-value after passing through the Envoy proxy. Normally this is fine, and RFC 2616 states that HTTP header field names are case-insensitive.
15 |
16 | In some scenarios, however, the business requires specific casing for certain headers, and Envoy's lower-casing breaks those requests.
17 |
18 | ## Solution
19 |
20 | Envoy supports several header-key formatting rules:
21 | - all lowercase (the default)
22 | - proper case (first letter capitalized)
23 |
24 | Envoy 1.8 added:
25 | - preserve the original casing of the request
26 |
27 | Based on these capabilities, to work around the default lower-casing: on Istio 1.8 and earlier, headers can be rewritten to proper case; on Istio 1.10 and later, the original header casing can be preserved.
28 |
29 | ## Configuration
30 |
31 | On Istio 1.8 and earlier, add an EnvoyFilter like the following:
32 | ```yaml
33 | apiVersion: networking.istio.io/v1alpha3
34 | kind: EnvoyFilter
35 | metadata:
36 | name: http-header-proper-case-words
37 | namespace: istio-system
38 | spec:
39 | configPatches:
40 | - applyTo: CLUSTER
41 | match:
42 | context: SIDECAR_OUTBOUND
43 | cluster:
44 | # The cluster name can be found via the config dump
45 | name: "outbound|3000||test2.default.svc.cluster.local"
46 | patch:
47 | operation: MERGE
48 | value:
49 | http_protocol_options:
50 | header_key_format:
51 | proper_case_words: {}
52 | ```
53 | Add this rule to the cluster of each service that requires upper-case headers; it rewrites all header keys to proper case (first letter capitalized).
54 |
55 | On Istio 1.10 and later, the following EnvoyFilter can be used instead:
56 | ```yaml
57 | apiVersion: networking.istio.io/v1alpha3
58 | kind: EnvoyFilter
59 | metadata:
60 | name: http-header-proper-case-words
61 | namespace: istio-system
62 | spec:
63 | configPatches:
64 | # Preserve the case of request headers sent to the upstream
65 | - applyTo: CLUSTER
66 | patch:
67 | operation: MERGE
68 | value:
69 | typed_extension_protocol_options:
70 | envoy.extensions.upstreams.http.v3.HttpProtocolOptions:
71 | '@type': type.googleapis.com/envoy.extensions.upstreams.http.v3.HttpProtocolOptions
72 | use_downstream_protocol_config:
73 | http_protocol_options:
74 | header_key_format:
75 | stateful_formatter:
76 | name: preserve_case
77 | typed_config:
78 | '@type': type.googleapis.com/envoy.extensions.http.header_formatters.preserve_case.v3.PreserveCaseFormatterConfig
79 | # Preserve the case of response headers received from the upstream
80 | - applyTo: NETWORK_FILTER
81 | match:
82 | listener:
83 | filterChain:
84 | filter:
85 | name: envoy.filters.network.http_connection_manager
86 | patch:
87 | operation: MERGE
88 | value:
89 | typed_config:
90 | '@type': type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager
91 | http_protocol_options:
92 | header_key_format:
93 | stateful_formatter:
94 | name: preserve_case
95 | typed_config:
96 | '@type': type.googleapis.com/envoy.extensions.http.header_formatters.preserve_case.v3.PreserveCaseFormatterConfig
97 |
98 | ```
99 | With this configuration Envoy keeps the original casing of the headers.
100 |
101 | The Envoy documentation on this topic: https://www.envoyproxy.io/docs/envoy/latest/configuration/http/http_conn_man/header_casing#config-http-conn-man-header-casing
102 |
--------------------------------------------------------------------------------
/content/zh/docs/common-problem/duplicate-tls-hosts/_index.md:
--------------------------------------------------------------------------------
1 |
2 | ---
3 | title: "Conflicting Gateway TLS Hosts Cause Configuration Rejection"
4 | linkTitle: "Conflicting Gateway TLS Hosts Cause Configuration Rejection"
5 | weight: 3
6 | date: 2022-07-06
7 | description:
8 | ---
9 |
10 | ## Symptom
11 | The mesh contains the following two Gateways at the same time:
12 | ```yaml
13 | apiVersion: networking.istio.io/v1beta1
14 | kind: Gateway
15 | metadata:
16 | name: test1
17 | spec:
18 | selector:
19 | istio: ingressgateway
20 | servers:
21 | - hosts:
22 | - test1.example.com
23 | port:
24 | name: https
25 | number: 443
26 | protocol: HTTPS
27 | tls:
28 | credentialName: example-credential
29 | mode: SIMPLE
30 | ---
31 | apiVersion: networking.istio.io/v1beta1
32 | kind: Gateway
33 | metadata:
34 | name: test2
35 | spec:
36 | selector:
37 | istio: ingressgateway
38 | servers:
39 | - hosts:
40 | - test1.example.com
41 | - test2.example.com
42 | port:
43 | name: https
44 | number: 443
45 | protocol: HTTPS
46 | tls:
47 | credentialName: example-credential
48 | mode: SIMPLE
49 | ```
50 |
51 | 172.18.0.6 为 ingress gateway Pod IP,请求 https://test1.example.com 正常返回 404
52 | ```bash
53 | curl -i -HHost:test1.example.com --resolve "test1.example.com:443:172.18.0.6" --cacert example.com.crt "https://test1.example.com"
54 | HTTP/2 404
55 | date: Mon, 29 Nov 2021 06:59:26 GMT
56 | server: istio-envoy
57 | ```
58 |
59 | 请求 https://test2.example.com 异常
60 | ```bash
61 | $ curl -HHost:test2.example.com --resolve "test2.example.com:443:172.18.0.6" --cacert example.com.crt "https://test2.example.com"
62 | curl: (35) OpenSSL SSL_connect: Connection reset by peer in connection to test2.example.com:443
63 | ```
64 | ## 故障原因
65 |
66 | 通过 istiod 监控发现`pilot_total_rejected_configs`指标异常,显示`default/test2`配置被拒绝
67 | 
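68 | 
69 | 如果没有接入监控面板,也可以直接查询 istiod 的自监控端口(15014)来确认该指标,示例如下:
70 | 
71 | ```bash
72 | kubectl -n istio-system port-forward deploy/istiod 15014:15014 &
73 | curl -s localhost:15014/metrics | grep pilot_total_rejected_configs
74 | ```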
68 | 调整 istiod 日志级别查看被拒绝的原因
69 | ```
70 | --log_output_level=model:debug
71 | ```
72 | ```
73 | 2021-11-29T07:24:21.703924Z debug model skipping server on gateway default/test2, duplicate host names: [test1.example.com]
74 | ```
75 | 通过日志定位到具体代码位置
76 | ```go
77 | if duplicateHosts := CheckDuplicates(s.Hosts, tlsHostsByPort[resolvedPort]); len(duplicateHosts) != 0 {
78 | log.Debugf("skipping server on gateway %s, duplicate host names: %v", gatewayName, duplicateHosts)
79 | RecordRejectedConfig(gatewayName)
80 | continue
81 | }
82 | ```
83 | ```go
84 | // CheckDuplicates returns all of the hosts provided that are already known
85 | // If there were no duplicates, all hosts are added to the known hosts.
86 | func CheckDuplicates(hosts []string, knownHosts sets.Set) []string {
87 | var duplicates []string
88 | for _, h := range hosts {
89 | if knownHosts.Contains(h) {
90 | duplicates = append(duplicates, h)
91 | }
92 | }
93 | // No duplicates found, so we can mark all of these hosts as known
94 | if len(duplicates) == 0 {
95 | for _, h := range hosts {
96 | knownHosts.Insert(h)
97 | }
98 | }
99 | return duplicates
100 | }
101 | ```
102 | 校验逻辑是每个域名在同一端口上只能配置一次 TLS。这里 test1.example.com 在 2 个 Gateway 的 443 端口都配置了 TLS,导致其中一个被拒绝;
103 | 通过监控确认被拒绝的是 test2。由于 test2.example.com 和 test1.example.com 配置在 test2 的同一个 Server 中,整个 Server 配置被拒绝,因此请求 test2.example.com 异常。
104 |
105 | ## 解决方案
106 | 同一个域名不要在多个 Gateway 中的同一端口重复配置 TLS,这里我们删除 test1 后请求恢复正常
107 | ```bash
108 | $ curl -i -HHost:test1.example.com --resolve "test1.example.com:443:172.18.0.6" --cacert example.com.crt "https://test1.example.com"
109 | HTTP/2 404
110 | date: Mon, 29 Nov 2021 07:43:40 GMT
111 | server: istio-envoy
112 |
113 | $ curl -i -HHost:test2.example.com --resolve "test2.example.com:443:172.18.0.6" --cacert example.com.crt "https://test2.example.com"
114 | HTTP/2 404
115 | date: Mon, 29 Nov 2021 07:43:41 GMT
116 | server: istio-envoy
117 | ```
--------------------------------------------------------------------------------
/content/zh/docs/common-problem/server-speaks-first-protocol/_index.md:
--------------------------------------------------------------------------------
1 |
2 | ---
3 | title: "Server Speaks First 协议访问失败"
4 | linkTitle: "Server Speaks First 协议访问失败"
5 | weight: 4
6 | date: 2022-07-06
7 | description:
8 | ---
9 |
10 | ## 故障现象
11 |
12 | Istio 网格开启 allow any 访问模式,在一个注入了 sidecar 的 pod 内,mysql 客户端访问 mysql-ip-1:3306 成功,访问 mysql-ip-2:10000 没有响应:
13 |
14 | ```
15 | # mysql -h55.135.153.1 -utest -pxxxx -P3306
16 | Welcome to the MariaDB monitor. Commands end with ; or \g.
17 |
18 | # mysql -h55.108.108.2 -utest -pxxxx -P10000
19 | (no response)
20 | ```
21 |
22 | ## 故障分析
23 |
24 | 查看日志,把 access log 设置为 debug、trace 均没有发现有用信息。
25 |
26 | 分析发现,网格内有一个 http server,也使用了和 mysql-ip-2 相同的端口 10000:
27 |
28 | ```
29 | apiVersion: v1
30 | kind: Service
31 | metadata:
32 | name: irrelevant-svc
33 | ......
34 | spec:
35 | ports:
36 | - name: http
37 | nodePort: 31025
38 | port: 10000 # 端口相同
39 | protocol: TCP
40 | targetPort: 8080
41 | ```
42 |
43 | 我们尝试把该服务端口改成 10001,访问 mysql-ip-2:10000 成功,推测和端口冲突相关:
44 |
45 | ```
46 | # mysql -h55.108.108.2 -utest -pxxxx -P10000
47 | Welcome to the MariaDB monitor. Commands end with ; or \g.
48 | ```
49 |
50 | 我们再尝试对 mysql-ip-1 复现故障:在网格内创建了一个包括 3306 端口的 http 服务,mysql 请求无响应,问题复现。
51 |
52 | 另外我们还尝试过,如果把冲突端口的协议定义为 tcp(通过 port name),该问题不存在:
53 |
54 | ```
55 | apiVersion: v1
56 | kind: Service
57 | metadata:
58 | name: irrelevant-svc
59 | ......
60 | spec:
61 | ports:
62 | - name: tcp # 如果是 tcp 则不会出问题
63 | nodePort: 31025
64 | port: 10000
65 | protocol: TCP
66 | targetPort: 8080
67 | ```
68 |
69 | ## 故障原因
70 |
71 | ### Server Speaks First
72 |
73 | MySQL 协议是一种 **Server Speaks First** 协议,也就是说 client 和 server 完成三次握手后,由 server 先发起会话,简要过程如下:
74 |
75 | ```
76 | S: 服务端首先会发一个握手包到客户端
77 | C: 客户端向服务端发送认证信息 ( 用户名,密码等 )
78 | S: 服务端收到认证包后,会检查用户名与密码是否合法,并发送包告知客户端认证信息。
79 | ```
80 |
81 | 除了 Mysql,常见的 Server Speaks First 协议还包括 SMTP,DNS,MongoDB 等。下面是一个 SMTP 交互流程:
82 |
83 | ```
84 | S: 220 smtp.example.com ESMTP Postfix
85 | C: HELO relay.example.com
86 | S: 250 smtp.example.com, I am glad to meet you
87 | C: MAIL FROM:<bob@example.org>
88 | S: 250 Ok
89 | C: RCPT TO:<alice@example.com>
90 | S: 250 Ok
91 | C: RCPT TO:<theboss@example.com>
92 | S: 250 Ok
93 | C: DATA
94 | S: 354 End data with <CR><LF>.<CR><LF>
95 | C: From: "Bob Example" <bob@example.org>
96 | C: To: Alice Example <alice@example.com>
97 | C: Cc: theboss@example.com
98 | C: Date: Tue, 15 Jan 2008 16:02:43 -0500
99 | C: Subject: Test message
100 | C:
101 | C: Hello Alice.
102 | C: This is a test message with 5 header fields and 4 lines in the message body.
103 | C: Your friend,
104 | C: Bob
105 | C: .
106 | S: 250 Ok: queued as 12345
107 | C: QUIT
108 | S: 221 Bye
109 | {The server closes the connection}
110 | ```
111 |
112 | ### istio 不是完全透明
113 |
114 | 当前 istio 的某些特性,不能做到**透明**兼容 Server Speaks First 协议,这些特性包括:
115 |
116 | * 协议嗅探
117 | * PERMISSIVE mTLS
118 | * Authorization Policy
119 |
120 | 这些特性都希望 client 能先发起会话,以协议嗅探为例,envoy 是通过分析 client 发出的初始若干字节来推测协议类型。
121 |
122 | 对于 Server Speaks First 协议,比如 mysql,三次握手后,这时候 mysql client 在等待 mysql server 发起初次会话,而 client 端的 envoy 尝试做协议嗅探,也在等 mysql client 发出数据,这类似一个死锁,最终超时。
123 |
124 |
125 | ## 解决方案
126 |
127 | 以下是一些可行的方案:
128 |
129 | 1. 为 Server Speaks First 协议服务创建一个 ServiceEntry,并指定协议为 TCP(配置示例见下文)。
130 | 2. 避免 Server Speaks First 协议服务端口和网格内服务端口重叠,这样请求可以直接走 passthrough。
131 | 3. 把 Server Speaks First 服务 ip 放到 excludeIPRanges,这样请求不经过 envoy 处理,适用于 DB 服务不需要网格治理的情况。
132 |
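133 | 下面给出方案 1 的一个配置示例:为 mysql-ip-2 创建 ServiceEntry,并通过端口名称和 protocol 字段显式声明为 TCP,避免 Envoy 在该地址/端口上做 HTTP 协议嗅探。示例中的名称、IP 和端口沿用了本文的故障场景,实际使用时请替换:
134 | 
135 | ```bash
136 | kubectl apply -f - <<EOF
137 | apiVersion: networking.istio.io/v1alpha3
138 | kind: ServiceEntry
139 | metadata:
140 |   name: external-mysql
141 | spec:
142 |   hosts:
143 |   - external-mysql.example.com   # 占位 host,仅用于在网格内标识该外部服务
144 |   addresses:
145 |   - 55.108.108.2                 # mysql-ip-2
146 |   ports:
147 |   - number: 10000
148 |     name: tcp-mysql              # 以 tcp 开头命名,显式声明为 TCP 协议
149 |     protocol: TCP
150 |   location: MESH_EXTERNAL
151 |   resolution: NONE
152 | EOF
153 | ```
154 | 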
133 |
134 | ## 参考资料
135 |
136 | * [Server First Protocols](https://istio.io/latest/docs/ops/deployment/requirements/#server-first-protocols)
137 | * [Server-first TCP protocols are not supported](https://istio.io/latest/docs/ops/best-practices/security/#server-first-tcp-protocols-are-not-supported)
138 | * [Istio Envoy passthrough goes wrong when port 80 are used for SMTP protocol instead of standard ports](https://www.linkedin.com/pulse/istio-envoy-passthrough-goes-wrong-when-port-80-used-smtp-liu-)
139 | * [Server-Speaks-First 有点坑](https://www.cnblogs.com/hacker-linner/p/15122404.html)
--------------------------------------------------------------------------------
/content/zh/docs/common-problem/upstream-connection-termination/_index.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "503 UC upstream_reset_before_response_started"
3 | linkTitle: "503 UC upstream_reset_before_response_started"
4 | weight: 13
5 | date: 2023-01-12
6 | description: Upstream 断开链路导致 503 UC。
7 | ---
8 |
9 | ## 故障现象
10 |
11 | 客户端直接访问服务器正常,但在 service mesh 中经过 envoy 访问服务器则会出现一定几率的 503 错误。查看客户端侧 envoy 的访问日志,发现日志中有下面的异常信息:
12 |
13 | ```bash
14 | [2023-01-05T04:21:37.764Z] "POST /foo/bar" 503 UC upstream_reset_before_response_started{connection_termination} - "-" 291 95 0 - "116.211.195.11,116.211.195.11" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36" "06a39679-a8d4-47f7-baf3-d688ea3e67c4" "foo.bar.com" "30.183.173.155:1984" outbound|1984||foor-service.bar-ns.svc.cluster.local 30.169.11.123:46894 30.169.11.123:443 116.211.195.11:21663 - -
15 | ```
16 |
17 | ## 故障原因
18 |
19 | 从访问日志中 `503 UC upstream_reset_before_response_started{connection_termination}` 的输出,我们可以初步推断出 503 的原因是连接被 upstream 侧中断了。
20 |
21 | 通过 Envoy 管理端口打开 debug 日志,可以看到在出现 503 UC 时,envoy 从 connection pool 中拿出了一个 upstream 的连接,但拿出该连接后,envoy 打印了一个 "remote close" 日志,说明该连接被对端关闭了。
22 |
23 | 
24 |
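25 | 上述 debug 日志可以通过 istio-proxy 容器中的 Envoy 管理端口(15000)打开,示例命令如下(Pod 名称需替换为实际值):
26 | 
27 | ```bash
28 | kubectl exec {POD-NAME} -c istio-proxy -- curl -X POST http://127.0.0.1:15000/logging?level=debug
29 | ```
30 | 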
25 | Envoy 的 HTTP Router 会在第一次和 Upstream 建立 TCP 连接并使用后将连接释放到一个连接池中,而不是直接关闭该连接。这样下次 downstream 请求相同的 Upstream host 时可以重用该连接,可以避免频繁创建/关闭连接带来的开销。
26 |
27 | 当连接被 Envoy 放入连接池后,连接中不再转发来自 downstream 的数据,即连接处于空闲状态。连接对端的应用程序会检查连接的空闲状态,并在空闲期间通过 [TCP keepalive packet](https://tldp.org/HOWTO/html_single/TCP-Keepalive-HOWTO/#whatis) 来侦测对端状态。由于空闲的连接也会占用资源,因此应用并不会无限制地在一个空闲连接上进行等待。几乎所有语言/框架在创建 TCP 服务器时都会设置一个 keepalive timeout 选项,如果在 keepalive timeout 的时间内没有收到新的 TCP 数据包,应用就会关闭该连接。
28 |
29 | 在应用端关闭连接后的极短时间内,Envoy 侧尚未感知到该连接的状态变化,如果此时 Envoy 收到了来自 downstream 的请求并将该连接从连接池中取出来使用,就会出现 `503 UC upstream_reset_before_response_started{connection_termination}` 异常。
30 |
31 | ## 解决方案
32 |
33 | ### 方案一
34 |
35 | 增大服务器端 TCP keepalive timeout 的时间间隔可以减少该问题出现的几率。该问题在 nodejs 应用中出现得较多,原因是 [nodejs 的缺省超时时间较短,只有 5 秒钟](https://nodejs.org/api/http.html#serverkeepalivetimeout),因此在 Envoy 连接池中取出的连接有较大几率刚好被对端的 nodejs 应用关闭了。
36 |
37 | > Timeout in milliseconds. Default: 5000 (5 seconds).
38 | The number of milliseconds of inactivity a server needs to wait for additional incoming data, after it has finished writing the last response, before a socket will be destroyed. If the server receives new data before the keep-alive timeout has fired, it will reset the regular inactivity timeout, i.e., server.timeout.
39 |
40 | 通过下面的方法可以在服务器端将 nodejs 的 keepalive timeout 时间延长为 6 分钟。
41 |
42 | ```node
43 | const server = app.listen(port, '0.0.0.0', () => {
44 | logger.info(`App is now running on http://localhost:${port}`)
45 | })
46 | server.keepAliveTimeout = 1000 * (60 * 6) // 6 minutes
47 | ```
48 |
49 | 其他语言的设置方法:
50 |
51 | Python
52 | ```python
53 | global_config = {
54 | 'server.socket_timeout': 6 * 60,
55 | }
56 | cherrypy.config.update(global_config)
57 | ```
58 |
59 | Go
60 | ```go
61 | var s = http.Server{
62 | Addr: ":8080",
63 | Handler: http.HandlerFunc(Index),
64 | IdleTimeout: 6 * time.Minute,
65 | }
66 | s.ListenAndServe()
67 | ```
68 |
69 | ### 方案二
70 |
71 | 通过方案一可以减少 503 UC 出现的频率,但理论上无论 keepalive timeout 设置为多大,都有出现 503 UC 的几率。而且我们也需要将 timeout 设置为一个合理的值,而不是无限大。要彻底解决该问题,可以采用 Virtual Service 为出现该问题的服务设置重试策略,在重试策略的 retryOn 中增加 `reset` 条件。
72 |
73 | 备注:
74 | Istio 缺省为服务设置了重试策略,但缺省的重试策略中并不会对连接重置这种情况进行重试。
75 |
76 | ```yaml
77 | apiVersion: networking.istio.io/v1alpha3
78 | kind: VirtualService
79 | metadata:
80 | name: ratings-route
81 | spec:
82 | hosts:
83 | - ratings.prod.svc.cluster.local
84 | http:
85 | - route:
86 | - destination:
87 | host: ratings.prod.svc.cluster.local
88 | subset: v1
89 | retries:
90 | attempts: 3
91 | retryOn: reset,connect-failure,refused-stream,unavailable,cancelled,retriable-status-codes
92 | ```
93 |
94 | ## 参考文档
95 |
96 | * [Istio: 503's with UC's and TCP Fun Times](https://karlstoney.com/2019/05/31/istio-503s-ucs-and-tcp-fun-times/)
97 | * [Envoy intermittently responds with 503 UC (upstream_reset_before_response_started{connection_termination})](https://github.com/envoyproxy/envoy/issues/14981)
98 | * [TCP-Keepalive-HOWTO](https://tldp.org/HOWTO/html_single/TCP-Keepalive-HOWTO/)
99 | * [Istio Virtual Service Retry Policy](https://istio.io/latest/docs/reference/config/networking/virtual-service/#HTTPRetry)
100 |
--------------------------------------------------------------------------------
/content/zh/docs/tricks/udp-listener/_index.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "支持 UDP Listener"
3 | weight: 1
4 | date: 2023-04-11
5 | description: >
6 | 在 Ingress Gateway 上对外提供 UDP 服务。
7 | ---
8 |
9 | Istio 并不会处理 UDP 类型的服务,当我们需要在 Ingress Gateway 上对外提供 UDP 服务时,可以通过 EnvoyFilter 来实现。
10 |
11 | ## 创建用于测试的 UDP 服务
12 |
13 | 创建一个 coredns,用于作为后端的测试 UDP 服务。
14 |
15 | ```bash
16 | kubectl apply -f - <<EOF
17 | ......
18 | EOF
19 | ```
20 | 
21 | 部署完成后,通过 network-tool 中的 dig 命令直接访问 coredns 的 53 端口(coredns Pod IP 为 10.244.0.20),确认可以解析到 foo.bar.com 的地址:
22 | 
23 | ```bash
24 | kubectl exec network-tool -- dig @10.244.0.20 -p 53 foo.bar.com
25 | 
26 | ; <<>> DiG 9.18.1-1ubuntu1.3-Ubuntu <<>> @10.244.0.20 -p 53 foo.bar.com
108 | ; (1 server found)
109 | ;; global options: +cmd
110 | ;; Got answer:
111 | ;; ->>HEADER<<- opcode: QUERY, status: NOERROR, id: 48665
112 | ;; flags: qr aa rd; QUERY: 1, ANSWER: 0, AUTHORITY: 0, ADDITIONAL: 3
113 | ;; WARNING: recursion requested but not available
114 |
115 | ;; OPT PSEUDOSECTION:
116 | ; EDNS: version: 0, flags:; udp: 1232
117 | ; COOKIE: 79265989d39004a6 (echoed)
118 | ;; QUESTION SECTION:
119 | ;foo.bar.com. IN A
120 |
121 | ;; ADDITIONAL SECTION:
122 | foo.bar.com. 0 IN A 10.244.0.21
123 | _udp.foo.bar.com. 0 IN SRV 0 0 37336 .
124 |
125 | ;; Query time: 1 msec
126 | ;; SERVER: 10.244.0.20#53(10.244.0.20) (UDP)
127 | ;; WHEN: Tue Apr 11 02:41:01 UTC 2023
128 | ;; MSG SIZE rcvd: 114
129 | ```
130 |
131 | ## 通过 EnvoyFilter 在 Ingress Gateway 上创建 UDP Listener 和对应的 Cluster
132 |
133 | EnvoyFilter 如下所示,该 EnvoyFilter 在 Ingress Gateway 上创建了一个 UDP Listener,该 UDP Listener 在 5300 端口上监听来自客户端的请求,并将请求转发到后端的 Coredns 服务上。
134 |
135 | > 备注:
136 | 此处的 EnvoyFilter 中硬编码了 Cluster 中 Endpoint 地址。由于 UDP 服务的 pod 地址会变化,因此在实际使用时,我们需要编写一个 Controller 来监听 UDP 服务,以动态生成该 EnvoyFilter。
137 |
138 | ```yaml
139 | apiVersion: networking.istio.io/v1alpha3
140 | kind: EnvoyFilter
141 | metadata:
142 | name: udp-listener
143 | namespace: istio-system
144 | spec:
145 | workloadSelector:
146 | labels:
147 | istio: ingressgateway
148 | configPatches:
149 | - applyTo: LISTENER
150 | match:
151 | context: GATEWAY
152 | patch:
153 | operation: ADD
154 | value:
155 | name: udp_listener
156 | address:
157 | socket_address:
158 | protocol: UDP
159 | address: 0.0.0.0
160 | port_value: 5300
161 | udp_listener_config:
162 | downstream_socket_config:
163 | max_rx_datagram_size: 9000
164 | listener_filters:
165 | - name: envoy.filters.udp_listener.udp_proxy
166 | typed_config:
167 | '@type': type.googleapis.com/envoy.extensions.filters.udp.udp_proxy.v3.UdpProxyConfig
168 | stat_prefix: coredns
169 | matcher:
170 | on_no_match:
171 | action:
172 | name: route
173 | typed_config:
174 | '@type': type.googleapis.com/envoy.extensions.filters.udp.udp_proxy.v3.Route
175 | cluster: coredns
176 | - applyTo: CLUSTER
177 | match:
178 | context: GATEWAY
179 | patch:
180 | operation: ADD
181 | value:
182 | name: coredns
183 | type: STATIC
184 | lb_policy: ROUND_ROBIN
185 | load_assignment:
186 | cluster_name: coredns
187 | endpoints:
188 | - lb_endpoints:
189 | - endpoint:
190 | address:
191 | socket_address:
192 | address: 10.244.0.20
193 | port_value: 53
194 | ```
195 |
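196 | 应用 EnvoyFilter 后,可以先确认 Ingress Gateway 上已经生成了 5300 端口的 UDP Listener(以下命令为示例,Pod 名称需替换为实际值):
197 | 
198 | ```bash
199 | GW_POD=$(kubectl -n istio-system get pod -l istio=ingressgateway -o jsonpath='{.items[0].metadata.name}')
200 | istioctl proxy-config listener ${GW_POD} -n istio-system --port 5300 -o json
201 | ```
202 | 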
196 | 此时通过 network-tool 中的 dig 命令访问 Ingress Gateway 的 5300 端口,可以查询到 foo.bar.com 的地址,说明 UDP Listener 创建成功。
197 |
198 | ```bash
199 | ➜ ~ kubectl exec network-tool -- dig @10.244.0.14 -p 5300 foo.bar.com
200 |
201 | ; <<>> DiG 9.18.1-1ubuntu1.3-Ubuntu <<>> @10.244.0.14 -p 5300 foo.bar.com
202 | ; (1 server found)
203 | ;; global options: +cmd
204 | ;; Got answer:
205 | ;; ->>HEADER<<- opcode: QUERY, status: NOERROR, id: 32291
206 | ;; flags: qr aa rd; QUERY: 1, ANSWER: 0, AUTHORITY: 0, ADDITIONAL: 3
207 | ;; WARNING: recursion requested but not available
208 |
209 | ;; OPT PSEUDOSECTION:
210 | ; EDNS: version: 0, flags:; udp: 1232
211 | ; COOKIE: c4486921ff737611 (echoed)
212 | ;; QUESTION SECTION:
213 | ;foo.bar.com. IN A
214 |
215 | ;; ADDITIONAL SECTION:
216 | foo.bar.com. 0 IN A 10.244.0.14
217 | _udp.foo.bar.com. 0 IN SRV 0 0 32875 .
218 |
219 | ;; Query time: 1 msec
220 | ;; SERVER: 10.244.0.14#5300(10.244.0.14) (UDP)
221 | ;; WHEN: Tue Apr 11 02:51:43 UTC 2023
222 | ;; MSG SIZE rcvd: 114
223 | ```
--------------------------------------------------------------------------------
/content/zh/docs/best-practice/internal-redirect/_index.md:
--------------------------------------------------------------------------------
1 |
2 | ---
3 | title: "在 Istio 中开启 Envoy 内部重定向"
4 | linkTitle: "在 Istio 中开启 Envoy 内部重定向"
5 | weight: 3
6 | date: 2022-07-06
7 | description:
8 | ---
9 |
10 | ## Envoy 内部重定向
11 |
12 | Envoy 支持在内部处理 3xx 重定向,捕获可配置的 3xx 重定向响应,合成一个新的请求,将其发送给新路由匹配指定的上游,将重定向的响应作为对原始请求的响应返回。原始请求的 header 和 body 将会发送至新位置。Trailers 尚不支持。
13 |
14 | 内部重定向可以使用路由配置中的 [internal_redirect_policy](https://www.envoyproxy.io/docs/envoy/latest/api-v3/config/route/v3/route_components.proto#envoy-v3-api-field-config-route-v3-routeaction-internal-redirect-policy) 字段来配置。 当重定向处理开启,任何来自上游的 3xx 响应,只要匹配到配置的 [redirect_response_codes](https://www.envoyproxy.io/docs/envoy/latest/api-v3/config/route/v3/route_components.proto#envoy-v3-api-field-config-route-v3-internalredirectpolicy-redirect-response-codes) 的响应都将由 Envoy 来处理。
15 |
16 | 如果内部重定向配置了 303 并且 Envoy 接收到了 303 响应:当原始请求不是 GET 或者 HEAD 时,Envoy 将使用不带 body 的 GET 处理重定向;当原始请求是 GET 或者 HEAD 时,Envoy 将使用原始的 HTTP Method 处理重定向。更多信息请查看 [RFC 7231 Section 6.4.4](https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4) 。
17 |
18 | 要成功地处理重定向,必须通过以下检查:
19 |
20 | 1. 响应码匹配到配置的 [redirect_response_codes](https://www.envoyproxy.io/docs/envoy/latest/api-v3/config/route/v3/route_components.proto#envoy-v3-api-field-config-route-v3-internalredirectpolicy-redirect-response-codes) ,默认是 302, 或者其他的 3xx 状态码(301, 302, 303, 307, 308)。
21 | 2. 拥有一个有效的、完全限定的 URL 的 location 头。
22 | 3. 该请求必须已被 Envoy 完全处理。
23 | 4. 请求必须小于 [per_request_buffer_limit_bytes](https://www.envoyproxy.io/docs/envoy/latest/api-v3/config/route/v3/route_components.proto#envoy-v3-api-field-config-route-v3-route-per-request-buffer-limit-bytes) 的限制。
24 | 5. [allow_cross_scheme_redirect](https://www.envoyproxy.io/docs/envoy/latest/api-v3/config/route/v3/route_components.proto#envoy-v3-api-field-config-route-v3-internalredirectpolicy-allow-cross-scheme-redirect) 是 true(默认是 false), 或者下游请求的 scheme 和 location 头一致。
25 | 6. 给定的下游请求之前处理的内部重定向次数不超过请求或重定向请求命中的路由配置的 [max_internal_redirects](https://www.envoyproxy.io/docs/envoy/latest/api-v3/config/route/v3/route_components.proto#envoy-v3-api-field-config-route-v3-internalredirectpolicy-max-internal-redirects) 。
26 | 7. 所有 [predicates](https://www.envoyproxy.io/docs/envoy/latest/api-v3/config/route/v3/route_components.proto#envoy-v3-api-field-config-route-v3-internalredirectpolicy-predicates) 都接受目标路由。
27 |
28 | 任何失败都将导致重定向传递给下游。
29 |
30 | 由于重定向请求可能会在不同的路由之间传递,重定向链中的任何满足以下条件的路由都将导致重定向被传递给下游。
31 |
32 | 1. 没有启用内部重定向
33 | 2. 或者当重定向链命中的路由的 [max_internal_redirects](https://www.envoyproxy.io/docs/envoy/latest/api-v3/config/route/v3/route_components.proto#envoy-v3-api-field-config-route-v3-internalredirectpolicy-max-internal-redirects) 小于等于重定向链的长度。
34 | 3. 或者路由被 [predicates](https://www.envoyproxy.io/docs/envoy/latest/api-v3/config/route/v3/route_components.proto#envoy-v3-api-field-config-route-v3-internalredirectpolicy-predicates) 拒绝。
35 |
36 | [previous_routes](https://www.envoyproxy.io/docs/envoy/latest/api-v3/extensions/internal_redirect/previous_routes/v3/previous_routes_config.proto#envoy-v3-api-msg-extensions-internal-redirect-previous-routes-v3-previousroutesconfig) 和 [allow_listed_routes](https://www.envoyproxy.io/docs/envoy/latest/api-v3/extensions/internal_redirect/allow_listed_routes/v3/allow_listed_routes_config.proto#envoy-v3-api-msg-extensions-internal-redirect-allow-listed-routes-v3-allowlistedroutesconfig) 这两个 predicates 可以创建一个有向无环图(DAG)来定义一个过滤器链。具体来说,allow_listed_routes 定义了有向无环图(DAG)中各个节点的边,而 previous_routes 定义了边的“访问”状态,因此如果需要就可以避免循环。
37 |
38 | 第三个 predicate [safe_cross_scheme](https://www.envoyproxy.io/docs/envoy/latest/api-v3/extensions/internal_redirect/safe_cross_scheme/v3/safe_cross_scheme_config.proto#envoy-v3-api-msg-extensions-internal-redirect-safe-cross-scheme-v3-safecrossschemeconfig) 被用来阻止 HTTP -> HTTPS 的重定向。
39 |
40 | 一旦重定向通过这些检查,发送到原始上游的请求头将被修改为:
41 |
42 | - 将完全限定的原始请求 URL 放到 x-envoy-original-url 头中。
43 | - 使用 Location 头中的值替换 Authority/Host、Scheme、Path 头。
44 |
45 | 修改后的请求将重新选择路由,通过一个新的过滤器链发送,并在经过 Envoy 所有常规的请求清理(sanitization)之后发送到上游。
46 |
47 | 请注意,HTTP 连接管理器的头清理(例如清除不受信任的标头)仅应用一次。即使原始路由和第二个路由相同,每个路由的头修改也会同时应用于这两次路由,因此请谨慎配置头修改规则,以避免重复添加不必要的请求头值。
48 |
49 |
50 | 一个简单的重定向流如下所示:
51 |
52 | 1. 客户端发送 GET 请求以获取 http://foo.com/bar
53 | 2. 上游 1 发送 302 响应码并携带 “location: http://baz.com/eep”
54 | 3. Envoy 被配置为允许原始路由上重定向,并发送新的 GET 请求到上游 2,携带请求头 “x-envoy-original-url: http://foo.com/bar” 获取 http://baz.com/eep
55 | 4. Envoy 将 http://baz.com/eep 的响应数据代理到客户端,作为对原始请求的响应。
56 |
57 | ## 在 Istio 中通过 EnvoyFilter 开启内部重定向
58 |
59 | ```yaml
60 | apiVersion: networking.istio.io/v1alpha3
61 | kind: EnvoyFilter
62 | metadata:
63 | name: follow-redirects
64 | namespace: istio-system
65 | spec:
66 | workloadSelector:
67 | labels:
68 | app: istio-ingressgateway
69 | configPatches:
70 | - applyTo: HTTP_ROUTE
71 | match:
72 | context: ANY
73 | patch:
74 | operation: MERGE
75 | value:
76 | route:
77 | internal_redirect_policy:
78 | max_internal_redirects: 5
79 | redirect_response_codes: ["302"]
80 | ```
81 |
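82 | 应用该 EnvoyFilter 后,可以检查 Ingress Gateway 的路由配置,确认 internal_redirect_policy 已合并到对应的 HTTP 路由中(以下命令为示例,Pod 名称需替换为实际值):
83 | 
84 | ```bash
85 | GW_POD=$(kubectl -n istio-system get pod -l app=istio-ingressgateway -o jsonpath='{.items[0].metadata.name}')
86 | istioctl proxy-config route ${GW_POD} -n istio-system -o json | grep -A 3 internalRedirectPolicy
87 | ```
88 | 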
82 | ## 测试
83 |
84 | 开启前
85 |
86 | ```bash
87 | curl -i '172.16.0.2/redirect-to?url=http://172.16.0.2/status/200'
88 |
89 | HTTP/1.1 302 Found
90 | server: istio-envoy
91 | date: Fri, 11 Mar 2022 07:20:38 GMT
92 | content-type: text/html; charset=utf-8
93 | content-length: 0
94 | location: http://172.16.0.2/status/200
95 | access-control-allow-origin: *
96 | access-control-allow-credentials: true
97 | x-envoy-upstream-service-time: 1
98 | ```
99 |
100 | 开启后
101 |
102 | ```bash
103 | curl -i '172.16.0.2/redirect-to?url=http://172.16.0.2/status/200'
104 |
105 | HTTP/1.1 200 OK
106 | server: istio-envoy
107 | date: Fri, 11 Mar 2022 07:21:03 GMT
108 | content-type: text/html; charset=utf-8
109 | access-control-allow-origin: *
110 | access-control-allow-credentials: true
111 | content-length: 0
112 | x-envoy-upstream-service-time: 0
113 | ```
114 |
115 | 注意 location 需返回完整 URL,下面这种情况不会触发内部重定向
116 |
117 | ```bash
118 | curl -i '172.16.0.2/status/302'
119 |
120 | HTTP/1.1 302 Found
121 | server: istio-envoy
122 | date: Fri, 11 Mar 2022 07:30:38 GMT
123 | location: /redirect/1
124 | access-control-allow-origin: *
125 | access-control-allow-credentials: true
126 | content-length: 0
127 | x-envoy-upstream-service-time: 1
128 | ```
129 |
130 | ## 参考资料
131 | * https://www.envoyproxy.io/docs/envoy/latest/intro/arch_overview/http/http_connection_management#internal-redirects
132 | * https://cloudnative.to/blog/envoy-http-connection-management/
133 | * https://github.com/istio/istio/issues/32673
134 |
--------------------------------------------------------------------------------
/content/zh/docs/debug-istio/envoy-log/_index.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Envoy 日志分析"
3 | linkTitle: "Envoy 日志分析"
4 | weight: 1
5 | date: 2022-07-06
6 | description:
7 | ---
8 |
9 |
10 | ## 1. 问题背景
11 |
12 | 这是使用 Istio 最常见的困境:在微服务中引入 Envoy 作为代理后,当流量访问和预期行为不符时,用户很难快速确定问题是出在哪个环节。客户端收到的异常响应,诸如 403、404、503 或者连接中断等,可能是链路中任一 Sidecar 执行流量管控的结果, 但也有可能是来自某个服务的合理逻辑响应。
13 |
14 | 特别的,当 Service Mesh 系统的维护者和应用程序的开发者来自不同的团队时,问题尤为凸显。
15 |
16 | 在 Mesh 中引入全链路跟踪系统,可以解决部分问题,我们可以知道请求到达了哪些工作负载,但是对于中断的异常请求,我们仍然很难确定原因。 因为本着最大透明化(Maximize Transparency)的设计目标,Istio 的遥测系统会尽量屏蔽掉 Sidecar 的存在。另一方面,用户自行维护一套全链路跟踪系统成本也很高,受限于遥测采样率和有限的协议支持,我们通常无法采集所有链路数据。
17 |
18 | 幸运的是,Envoy 本身可以记录流量的信息,本文主要介绍如何利用 Envoy 日志,对类似问题进行定位。
19 |
20 | ---
21 |
22 | ## 2. Envoy 流量模型
23 |
24 | 我们先看看 Envoy 的流量模型:
25 |
26 | 1. 监听,接受连接
27 | 2. 根据用户流量操纵规则,进行流量特征识别
28 | 3. 进行流量操纵,如负载均衡,转发,拒绝等
29 |
30 | 在以上流程中, Envoy 接受请求流量叫做 **Downstream**,Envoy 发出请求流量叫做 **Upstream**。在处理 Downstream 和 Upstream 过程中, 分别会涉及 2 个流量端点,即请求的发起端和接收端:
31 |
32 | 
33 |
34 | 在这个过程中, Envoy 会根据用户规则,计算出符合条件的转发目的主机集合,这个集合叫做 **UPSTREAM_CLUSTER**, 并根据负载均衡规则,从这个集合中选择一个 host 作为流量转发的接收端点,这个 host 就是 **UPSTREAM_HOST**。
35 |
36 | 以上就是 Envoy 请求处理的 **流量五元组信息**, 这是 Envoy 日志里最重要的部分,通过这个五元组我们可以准确地观测流量「从哪里来」和「到哪里去」。
37 |
38 | * UPSTREAM_CLUSTER
39 | * DOWNSTREAM_REMOTE_ADDRESS
40 | * DOWNSTREAM_LOCAL_ADDRESS
41 | * UPSTREAM_LOCAL_ADDRESS
42 | * UPSTREAM_HOST
43 |
44 | ---
45 |
46 | ## 3. Helloworld example
47 |
48 | 在 Istio 场景中,Envoy 既可以是正向代理,也可以是反向代理。在上图中, 如果 Envoy 处理的是 Outbound 流量, 业务容器是作为 Downstream 端点(右边);如果 Envoy 处理的是 Inbound 流量, 业务容器是作为 Upstream 端点(左边)。
49 |
50 | Istio 中默认不开启 Envoy 中的访问日志,需要手动打开,将 Istio 配置中 `accessLogFile` 设置为 `/dev/stdout`:
51 |
52 | ```yaml
53 | % kubectl -n istio-system edit cm istio
54 | ......
55 | # Set accessLogFile to empty string to disable access log.
56 | accessLogFile: "/dev/stdout" # 开启日志
57 |
58 | accessLogEncoding: 'JSON' # 默认日志是单行格式, 可选设置为 JSON
59 | ......
60 | ```
61 |
62 | 我们以 sleep Pod 访问 hello 服务来举例说明:
63 |
64 | ```shell
65 | kubectl apply -f sleep-hello.yaml
66 | ```
67 |
68 | 
69 |
70 | 该文件定义了 2 个版本的 helloworld 和一个 sleep Pod,helloworld Service 的端口是 4000, 而 Pod 的端口是 5000。
71 |
72 | 从 sleep Pod 中去访问 helloworld 服务, 确认应用正常:
73 |
74 | ```shell
75 | % SLEEP_POD=$(kubectl get pod -l app=sleep -o jsonpath="{.items[0].metadata.name}")
76 | % HELLO_V1_POD=$(kubectl get pod -l app=helloworld -l version=v1 -o jsonpath="{.items[0].metadata.name}")
77 | % kubectl exec -it $SLEEP_POD -csleep -- sh
78 | / # curl helloworld:4000/hello
79 | ```
80 |
81 | 这时候我们可以去分析 2 个 Pod 各自的 Envoy 日志:
82 |
83 | 
84 |
85 | 用一张图来说明:
86 |
87 | 
88 |
89 | 从日志中我们可以分析出:
90 |
91 | 对于 sleep Pod,sleep app 发出的流量目的端是 hello Service ip 和 Service port,sleep Envoy 处理的是 Outbound 流量,Envoy 根据规则选择的「UPSTREAM_CLUSTER」是 `outbound|4000||helloworld.default.svc.cluster.local`,然后转发给其中的一个「UPSTREAM_HOST」,也就是 hello Pod 的 ip 和 port。
92 | 
93 | 对于 hello Pod,其 Envoy 处理的是 Inbound 流量,Envoy 根据规则选择的「UPSTREAM_CLUSTER」是 `inbound|4000|http|helloworld.default.svc.cluster.local`,其中的「UPSTREAM_HOST」是「127.0.0.1:5000」,也就是该 Pod 里的 hello app。
94 |
95 | 因此,我们可以总结出 Istio 中流量端点值的逻辑规则:
96 |
97 | #### UPSTREAM_HOST
98 |
99 | 上游主机的 host,表示从 Envoy 发出的请求的目的端,通常是「ip:port」
100 |
101 | 通常来说,对于 Outbound Cluster,此值是「上游 pod-ip : pod-port」 ,而对于 Inbound Cluster,此值是「127.0.0.1 : pod-port」
102 |
103 | #### UPSTREAM_LOCAL_ADDRESS
104 |
105 | 上游连接中,当前 Envoy 的本地地址,此值是「当前 pod-ip : 随机端口」
106 |
107 | #### DOWNSTREAM_LOCAL_ADDRESS
108 |
109 | 下游连接中,当前 Envoy 的本地地址。
110 |
111 | 通常来说,对于 Outbound Cluster,此值是「目的 service-ip : service-port 」,而对于 Inbound Cluster,此值是「当前 pod-ip : pod-port」
112 |
113 | #### DOWNSTREAM_REMOTE_ADDRESS
114 |
115 | 下游连接中远端地址。
116 |
117 | 通常来说,对于 Outbound Cluster,此值是「当前 pod-ip : 随机端口 」,而对于 Inbound Cluster,此值是「下游 pod-ip : 随机端口」
118 |
119 | ---
120 |
121 | ## 4. Envoy 日志格式
122 |
123 | Envoy 允许定制日志格式, 格式通过若干「Command Operators」组合,用于提取请求信息,Istio 没有使用 Envoy 默认的日志格式, Istio 定制的访问日志格式如下:
124 |
125 | 
126 |
127 | 完整的「Command Operators」含义可查阅 [Envoy Access logging Command Operators](https://www.envoyproxy.io/docs/envoy/latest/configuration/observability/access_log/usage#command-operators)
128 |
129 | 除了以上流量五元组,流量分析中常用的重要信息还有:
130 |
131 | #### RESPONSE_CODE
132 |
133 | 响应状态码
134 |
135 | #### RESPONSE_FLAGS
136 |
137 | 很重要的信息,Envoy 中自定义的响应标志位, 可以认为是 Envoy 附加的流量状态码。
138 |
139 | 如「NR」表示找不到路由,「UH」表示 Upstream Cluster 中没有健康的 host,「RL」表示触发 rate limit,「UO」表示触发断路器。
140 |
141 | `RESPONSE_FLAGS` 可选值有十几个,这些信息在调试中非常关键。
142 |
143 | #### X-REQUEST-ID
144 |
145 | 一次 C 到 S 的 http 请求,Envoy 会在 C 端生成 request id,并附加到 header 中,传递到 S 端,在两端的日志中都会记录该值,因此可以通过这个 ID 关联请求的上下游。注意不要和全链路跟踪中的 trace id 混淆。
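146 | 
147 | 例如,可以用同一个 x-request-id 在上下游两个 Pod 的 sidecar 访问日志中检索同一次请求(示例命令,其中的 Pod 名称和 request id 均为占位符):
148 | 
149 | ```bash
150 | kubectl logs <downstream-pod> -c istio-proxy | grep <x-request-id>
151 | kubectl logs <upstream-pod> -c istio-proxy | grep <x-request-id>
152 | ```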
146 |
147 | #### ROUTE_NAME
148 |
149 | 匹配执行的路由名称
150 |
151 | ---
152 |
153 | ## 5. 场景:判断异常返回是来自业务还是 Sidecar?
154 |
155 | 比如我们希望所有请求 helloworld 都路由到 v1 版本,创建对应的 VirtualService:
156 |
157 | ```shell
158 | % kubectl apply -f hello-v1-virtualservice.yaml
159 | ```
160 |
161 | ```yaml
162 | apiVersion: networking.istio.io/v1alpha3
163 | kind: VirtualService
164 | metadata:
165 | name: hello
166 | spec:
167 | hosts:
168 | - "helloworld"
169 | http:
170 | - route:
171 | - destination:
172 | host: helloworld
173 | subset: v1
174 | port:
175 | number: 4000
176 | ```
177 |
178 | 从 sleep 中访问发现响应 503:
179 |
180 | 
181 |
182 |
183 | 如果没有上下文,我们很难判断 503 是来自业务容器还是 Sidecar,查看 sleep 和 hello 的 Envoy 日志,可以发现:hello Pod 的 Envoy 没有接受到请求,sleep Pod 的 Envoy 里日志:
184 |
185 | 
186 |
187 | 其中 `"response_flags": "NR"` 表示「No route configured」,也就是 Envoy 找不到路由,我们可以判断出该异常是由 Envoy 返回的。
188 |
189 | 通过简单的分析就可以找到原因:我们在 VirtualService 中引用的 subset(v1)没有对应的 DestinationRule 定义,将其补上:
190 |
191 | ```shell
192 | % kubectl apply -f hello-v1-destinationrule.yaml
193 | ```
194 |
195 | ```yaml
196 | apiVersion: networking.istio.io/v1alpha3
197 | kind: DestinationRule
198 | metadata:
199 | name: hello
200 | spec:
201 | host: helloworld
202 | subsets:
203 | - name: v1
204 | labels:
205 | version: v1
206 | ```
207 |
208 | 再次访问请求正常,日志中 `response_flags` 为空:
209 |
210 | 
211 |
212 | ---
213 |
214 | ## 6. 开启 debug 模式
215 |
216 | Envoy 默认日志级别是 info,其日志内容能满足大部分调试场景需求,但对于比较复杂的异常,我们往往还需要开启 debug 级别,能获取到更多的流量处理过程和信息,对某个特定的 Pod,调整日志级别为 debug 的命令:
217 |
218 | ```
219 | kubectl exec {POD-NAME} -c istio-proxy -- curl -X POST http://127.0.0.1:15000/logging?level=debug
220 | ```
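221 | 
222 | 调试完成后,建议用同样的方式将日志级别恢复为 info,避免 debug 日志量过大:
223 | 
224 | ```bash
225 | kubectl exec {POD-NAME} -c istio-proxy -- curl -X POST http://127.0.0.1:15000/logging?level=info
226 | ```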
--------------------------------------------------------------------------------
/content/zh/docs/common-problem/tcp-keepalive/_index.md:
--------------------------------------------------------------------------------
1 |
2 | ---
3 | title: "长连接未开启 tcp keepalive"
4 | linkTitle: "长连接未开启 tcp keepalive"
5 | weight: 5
6 | date: 2022-07-06
7 | description:
8 | ---
9 |
10 | ## 故障现象
11 | 用户反馈链路偶发 500 错误,频率低但是持续存在。
12 |
13 | 用户访问链路较长,核心链路简化如下:
14 |
15 | ```
16 | 1. client ->
17 | 2. [istio ingress gateway] ->
18 | 3. podA[app->sidecar] ->
19 | 4. 腾讯云内网CLB ->
20 | 5. [istio ingress gateway] ->
21 | 6. podB[sidecar->app]
22 | ```
23 |
24 | 应用对外是 https 服务,证书在 istio ingress gateway 上处理。
25 |
26 | ## 故障分析
27 |
28 | 通过分析链路中 sidecar accesslog 日志,有以下现象:
29 |
30 | 1. 第 3 跳 podA 正常发出请求,但接收到 500 返回。
31 | 2. 第 5 跳 istio ingress gateway 没有该 500 对应的访问日志。
32 |
33 | 因此重点分析 第 3,4,5 跳。
34 |
35 |
36 | 在第 3 跳 podA 上抓到 500 对应的数据包:
37 |
38 |
39 | 
40 |
41 |
42 | 抓包显示,podA 向一个已经断开的连接发送数据包,收到 RST 因此返回 500,但抓包并没有发现这个连接之前有主动断开的行为(FIN)。
43 |
44 | 登录 podA,查看连接情况:
45 |
46 |
47 | 
48 |
49 | ss 显示用户代码里使用了 tcp 长连接,注意这里我们使用了 ss 参数 `-o`, 该参数可以显示 tcp keepalive timer 信息:
50 |
51 | ```
52 | -o, --options
53 | Show timer information. For TCP protocol, the output
54 | format is:
55 | 
56 | timer:(<timer_name>,<expire_time>,<retrans>)
57 | 
58 | <timer_name>
59 | the name of the timer, there are five kind of timer
60 | names:
61 | 
62 | on : means one of these timers: TCP retrans timer,
63 | TCP early retrans timer and tail loss probe timer
64 | 
65 | keepalive: tcp keep alive timer
66 | 
67 | timewait: timewait stage timer
68 | 
69 | persist: zero window probe timer
70 | 
71 | unknown: none of the above timers
72 | 
73 | <expire_time>
74 | how long time the timer will expire
75 | 
76 | <retrans>
77 | how many times the retransmission occurred
78 | ```
79 |
80 | 但从 ss 结果并未看到 timer 信息,推断 podA 使用的长连接并未开启 keepalive。
81 |
82 |
83 | ## 故障原因
84 |
85 | podA 使用了 tcp 长连接,但是没有开启 keepalive,当长连接出现一段时间空闲,该连接可能被网络中间组件释放,比如 client、server 端的母机,但 client 端仍然持有该已断开的连接,后续复用该连接就会导致上述异常。
86 |
87 | ## 解决方案
88 |
89 | 问题本质是因为长连接 idle 过长,且缺乏探活机制,导致 client 没感知到连接已释放,尝试三种方案:
90 |
91 | 1. 应用代码修复
92 | 2. istio 方案:client sidecar 开启 keepalive
93 | 3. istio 方案:server 开启 keepalive
94 |
95 | ### 应用代码修复
96 |
97 | 最直接的方案是应用在使用长连接时开启 tcp keepalive。下面以 golang 程序为例,我们尝试用长连接访问 https://www.baidu.com。
98 |
99 | 先模拟使用长连接但不开启 keepalive:
100 |
101 | ```golang
102 | var HTTPTransport = &http.Transport{
103 | DialContext: (&net.Dialer{
104 | Timeout: 10 * time.Second,
105 | KeepAlive: -1 * time.Second, // disable TCP KeepAlive
106 | }).DialContext,
107 | MaxIdleConns: 50,
108 | IdleConnTimeout: 60 * time.Second,
109 | MaxIdleConnsPerHost: 20,
110 | }
111 |
112 | func main() {
113 | uri := "https://www.baidu.com"
114 | times := 200
115 |
116 | client := http.Client{Transport: HTTPTransport}
117 | for i := 0; i < times; i++ {
118 | time.Sleep(2 * time.Second)
119 | req, err := http.NewRequest(http.MethodGet, uri, nil)
120 | if err != nil {
121 | fmt.Println("NewRequest Failed " + err.Error())
122 | continue
123 | }
124 | resp, err := client.Do(req)
125 | if err != nil {
126 | fmt.Println("Http Request Failed " + err.Error())
127 | continue
128 | }
129 | fmt.Println(resp.Status)
130 | ioutil.ReadAll(resp.Body)
131 | resp.Body.Close()
132 | 	}
133 | }
133 | ```
134 |
135 | 注意 `KeepAlive: -1` 表示禁用了 tcp keepalive 探活,ss 查看:
136 |
137 | 
138 |
139 | 结果显示长连接缺乏 timer。注意测试 pod 在 istio 环境,上述第一个连接是 go 程序到 envoy,第二个连接是 envoy 到 baidu。
140 |
141 | golang 代码修复方案很简单,只需要把 `KeepAlive` 设置为非负数,代码修改如下:
142 |
143 | ```golang
144 | var HTTPTransport = &http.Transport{
145 | DialContext: (&net.Dialer{
146 | Timeout: 10 * time.Second,
147 | KeepAlive: 120 * time.Second, // keepalive 设置为 2 分钟
148 | }).DialContext,
149 | MaxIdleConns: 50,
150 | IdleConnTimeout: 60 * time.Second,
151 | MaxIdleConnsPerHost: 20,
152 | }
153 | ```
154 |
155 | ss 查看连接情况:
156 |
157 | 
158 |
159 | ss 显示 go client 到 envoy 开启了 keepalive,问题解决。
160 |
161 | **但用户应用程序较多,不方便逐一调整 keepalive,希望通过 istio sidecar 来解决上述问题**。keepalive 可以在 client、server 任意一端开启,以下是使用 istio 的两种方案:
162 |
163 |
164 | ### istio 方案:client sidecar 开启 keepalive
165 |
166 | 该方案需要 client 注入 istio sidecar。仍以访问 baidu https 为例,外部服务在 istio 中默认转发到 PassthroughCluster,要对指定外部服务流量进行流控,我们需要先给该服务创建一个 ServiceEntry:
167 |
168 | ```yaml
169 | apiVersion: networking.istio.io/v1alpha3
170 | kind: ServiceEntry
171 | metadata:
172 | name: baidu-https
173 | spec:
174 | hosts:
175 | - www.baidu.com
176 | location: MESH_EXTERNAL
177 | ports:
178 | - number: 443
179 | name: https
180 | protocol: TLS
181 | ```
182 |
183 | 然后增加 tcp keepalive 设置:
184 |
185 | ```yaml
186 | apiVersion: networking.istio.io/v1alpha3
187 | kind: DestinationRule
188 | metadata:
189 | name: baidu-https
190 | spec:
191 | host: www.baidu.com
192 | trafficPolicy:
193 | connectionPool:
194 | tcp:
195 | maxConnections: 100
196 | tcpKeepalive:
197 | time: 600s
198 | interval: 75s
199 | probes: 9
200 | ```
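201 | 
202 | DestinationRule 下发后,可以查看 client sidecar 上对应 cluster 的配置,确认 tcpKeepalive 已经生效(以下命令为示例,Pod 名称需替换为实际值,字段名以实际输出为准):
203 | 
204 | ```bash
205 | istioctl proxy-config cluster <client-pod> --fqdn www.baidu.com -o json | grep -A 8 upstreamConnectionOptions
206 | ```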
201 |
202 | 
203 |
204 | ss 显示 go client 到 envoy 并没有 keepalive, 但 envoy 到 baidu 开启了 keepalive。
205 |
206 |
207 | ### istio 方案:server 开启 keepalive
208 |
209 | 用户异常链路的 server 入口是 CLB 后端的 ingress gateway,在 ingress gateway 上开启 keepalive 会稍微复杂一点,需要使用 EnvoyFilter 来设置 socket options:
210 |
211 | ```yaml
212 | apiVersion: networking.istio.io/v1alpha3
213 | kind: EnvoyFilter
214 | metadata:
215 | name: ingress-gateway-socket-options
216 | namespace: istio-system
217 | spec:
218 | configPatches:
219 | - applyTo: LISTENER
220 | match:
221 | context: GATEWAY
222 | listener:
223 | name: 0.0.0.0_443
224 | portNumber: 443
225 | patch:
226 | operation: MERGE
227 | value:
228 | socket_options:
229 | - int_value: 1
230 | level: 1 # SOL_SOCKET
231 | name: 9 # SO_KEEPALIVE
232 | state: STATE_PREBIND
233 | - int_value: 9
234 | level: 6 # IPPROTO_TCP
235 | name: 6 # TCP_KEEPCNT
236 | state: STATE_PREBIND
237 | - int_value: 600
238 | level: 6 # IPPROTO_TCP
239 | name: 4 # TCP_KEEPIDLE
240 | state: STATE_PREBIND
241 | - int_value: 75
242 | level: 6 # IPPROTO_TCP
243 | name: 5 # TCP_KEEPINTVL
244 | state: STATE_PREBIND
245 | ```
246 |
247 | 上述配置的含义是:对于 443 端口的 Listener,为 tcp 连接设置 socket options:连接空闲 600s 后开始发送探活 probe;如果探活失败,会持续探测 9 次,探测间隔为 75s。
248 |
249 | 在 ingress gateway 上执行 ss,显示 443 端口上的连接都开启了 keepalive:
250 |
251 | 
252 |
253 | 如果用户 client 较多不便调整,更适合在 server (ingress gateway)开启 keepalive。另外该方案对 client 有无 sidecar 没有要求。
254 |
255 | ## 总结
256 |
257 | 使用长连接时,应用需要设置合理的 keepalive 参数,特别是对于访问频次较低的场景,以及链路较长的情况。
258 |
259 | **istio 无侵入式的流量操纵能力,可以很方便地对流量行为进行调优,这也是用户选择 istio 的重要原因。**
260 |
261 | ---
262 |
263 | ## 参考资料
264 |
265 | * https://www.envoyproxy.io/docs/envoy/latest/api-v3/config/core/v3/socket_option.proto
266 | * https://github.com/envoyproxy/envoy/issues/3634
267 | * https://github.com/istio/istio/issues/28879
--------------------------------------------------------------------------------
/config.toml:
--------------------------------------------------------------------------------
1 | #baseURL = "/"
2 | baseURL="https://www.zhaohuabing.com/istio-guide/"
3 | title = "Istio 运维实战"
4 |
5 | # Language settings
6 | contentDir = "content"
7 | defaultContentLanguage = "zh"
8 | defaultContentLanguageInSubdir = false
9 | # Useful when translating.
10 | enableMissingTranslationPlaceholders = true
11 |
12 | enableRobotsTXT = true
13 |
14 | # Will give values to .Lastmod etc.
15 | enableGitInfo = true
16 |
17 | # Comment out to enable taxonomies in Docsy
18 | # disableKinds = ["taxonomy", "taxonomyTerm"]
19 |
20 | # You can add your own taxonomies
21 | [taxonomies]
22 | tag = "tags"
23 | category = "categories"
24 |
25 | [params.taxonomy]
26 | # set taxonomyCloud = [] to hide taxonomy clouds
27 | taxonomyCloud = ["tags", "categories"]
28 |
29 | # If used, must have same length as taxonomyCloud
30 | taxonomyCloudTitle = ["Tag Cloud", "Categories"]
31 |
32 | # set taxonomyPageHeader = [] to hide taxonomies on the page headers
33 | taxonomyPageHeader = ["tags", "categories"]
34 |
35 |
36 | # Highlighting config
37 | pygmentsCodeFences = true
38 | pygmentsUseClasses = false
39 | # Use the new Chroma Go highlighter in Hugo.
40 | pygmentsUseClassic = false
41 | #pygmentsOptions = "linenos=table"
42 | # See https://help.farbox.com/pygments.html
43 | pygmentsStyle = "tango"
44 |
45 | # Configure how URLs look like per section.
46 | [permalinks]
47 | blog = "/:section/:year/:month/:day/:slug/"
48 |
49 | ## Configuration for BlackFriday markdown parser: https://github.com/russross/blackfriday
50 | [blackfriday]
51 | plainIDAnchors = true
52 | hrefTargetBlank = true
53 | angledQuotes = false
54 | latexDashes = true
55 |
56 | # Image processing configuration.
57 | [imaging]
58 | resampleFilter = "CatmullRom"
59 | quality = 75
60 | anchor = "smart"
61 |
62 | [services]
63 | [services.googleAnalytics]
64 | # Comment out the next line to disable GA tracking. Also disables the feature described in [params.ui.feedback].
65 | id = "G-Q13FPWS6LW"
66 |
67 | # Language configuration
68 |
69 | [languages.zh]
70 | title = "Istio 运维实战"
71 | description = "Istio 运维实战"
72 | languageName ="中文"
73 | contentDir = "content/zh"
74 | time_format_default = "2006.02.01"
75 | time_format_blog = "2006.02.01"
76 |
77 | [markup]
78 | [markup.goldmark]
79 | [markup.goldmark.renderer]
80 | unsafe = true
81 | [markup.highlight]
82 | # See a complete list of available styles at https://xyproto.github.io/splash/docs/all.html
83 | style = "tango"
84 | # Uncomment if you want your chosen highlight style used for code blocks without a specified language
85 | # guessSyntax = "true"
86 |
87 | # Everything below this are Site Params
88 |
89 | # Comment out if you don't want the "print entire section" link enabled.
90 | [outputs]
91 | section = ["HTML", "print", "RSS"]
92 |
93 | [params]
94 | copyright = "Huabing Blog"
95 | privacy_policy = "https://www.zhaohuabing.com/"
96 |
97 | # First one is picked as the Twitter card image if not set on page.
98 | # images = ["images/project-illustration.png"]
99 |
100 | # Menu title if your navbar has a versions selector to access old versions of your site.
101 | # This menu appears only if you have at least one [params.versions] set.
102 | version_menu = "Releases"
103 |
104 | # Flag used in the "version-banner" partial to decide whether to display a
105 | # banner on every page indicating that this is an archived version of the docs.
106 | # Set this flag to "true" if you want to display the banner.
107 | archived_version = false
108 |
109 | # The version number for the version of the docs represented in this doc set.
110 | # Used in the "version-banner" partial to display a version number for the
111 | # current doc set.
112 | version = "0.0"
113 |
114 | # A link to latest version of the docs. Used in the "version-banner" partial to
115 | # point people to the main doc site.
116 | url_latest_version = "https://zhaohuabing.com/istio-guide"
117 |
118 | # Repository configuration (URLs for in-page links to opening issues and suggesting changes)
119 | github_repo = "https://github.com/zhaohuabing/istio-guide"
120 | # An optional link to a related project repo. For example, the sibling repository where your product code lives.
121 | #github_project_repo = "https://github.com/google/docsy"
122 |
123 | # Specify a value here if your content directory is not in your repo's root directory
124 | # github_subdir = ""
125 |
126 | # Uncomment this if your GitHub repo does not have "main" as the default branch,
127 | # or specify a new value if you want to reference another branch in your GitHub links
128 | github_branch= "master"
129 |
130 | # Google Custom Search Engine ID. Remove or comment out to disable search.
131 | gcs_engine_id = "d72aa9b2712488cc3"
132 |
133 | # Enable Algolia DocSearch
134 | algolia_docsearch = false
135 |
136 | # Enable Lunr.js offline search
137 | offlineSearch = false
138 |
139 | # Enable syntax highlighting and copy buttons on code blocks with Prism
140 | prism_syntax_highlighting = false
141 |
142 | # User interface configuration
143 | [params.ui]
144 | # Set to true to disable breadcrumb navigation.
145 | breadcrumb_disable = false
146 | # Set to true to disable the About link in the site footer
147 | footer_about_disable = true
148 | # Set to false if you don't want to display a logo (/assets/icons/logo.svg) in the top navbar
149 | navbar_logo = true
150 | # Set to true if you don't want the top navbar to be translucent when over a `block/cover`, like on the homepage.
151 | navbar_translucent_over_cover_disable = false
152 | # Enable to show the side bar menu in its compact state.
153 | sidebar_menu_compact = true
154 | ul_show = 1
155 | sidebar_menu_foldable = true
156 | sidebar_cache_limit = 100
157 | # Set to true to hide the sidebar search box (the top nav search box will still be displayed if search is enabled)
158 | sidebar_search_disable = false
159 |
160 | # Adds a H2 section titled "Feedback" to the bottom of each doc. The responses are sent to Google Analytics as events.
161 | # This feature depends on [services.googleAnalytics] and will be disabled if "services.googleAnalytics.id" is not set.
162 | # If you want this feature, but occasionally need to remove the "Feedback" section from a single page,
163 | # add "hide_feedback: true" to the page's front matter.
164 | [params.ui.feedback]
165 | enable = false
166 | # The responses that the user sees after clicking "yes" (the page was helpful) or "no" (the page was not helpful).
167 | yes = 'Glad to hear it! Please tell us how we can improve.'
168 | no = 'Sorry to hear that. Please tell us how we can improve.'
169 |
170 | # Adds a reading time to the top of each doc.
171 | # If you want this feature, but occasionally need to remove the Reading time from a single page,
172 | # add "hide_readingtime: true" to the page's front matter
173 | [params.ui.readingtime]
174 | enable = false
175 |
176 | [params.links]
177 | # End user relevant links. These will show up on left side of footer and in the community page if you have one.
178 | [[params.links.user]]
179 | name = "Email"
180 | url = "mailto:zhaohuabing@zhaohuabing.com"
181 | icon = "fa fa-envelope"
182 | desc = "Discussion and help from your fellow users"
183 | [[params.links.user]]
184 | name ="Twitter"
185 | url = "https://twitter.com/zhaohuabing"
186 | icon = "fab fa-twitter"
187 | desc = "Follow us on Twitter to get the latest news!"
188 | # Developer relevant links. These will show up on right side of footer and in the community page if you have one.
189 | [[params.links.developer]]
190 | name = "GitHub"
191 | url = "https://github.com/zhaohuabing/istio-guide"
192 | icon = "fab fa-github"
193 | desc = "Development takes place here!"
194 | [params.plantuml]
195 | enable = true
196 | theme = "default"
197 |
198 | #Set url to plantuml server
199 | #default is http://www.plantuml.com/plantuml/svg/
200 | svg_image_url = "https://www.plantuml.com/plantuml/svg/"
201 |
202 | # hugo module configuration
203 |
204 | [module]
205 | # uncomment line below for temporary local development of module
206 | # replacements = "github.com/google/docsy -> ../../docsy"
207 | [module.hugoVersion]
208 | extended = true
209 | min = "0.75.0"
210 | [[module.imports]]
211 | path = "github.com/google/docsy"
212 | disable = false
213 | [[module.imports]]
214 | path = "github.com/google/docsy/dependencies"
215 | disable = false
216 |
217 | [params.giscus]
218 | data_repo = "zhaohuabing/istio-guide"
219 | data_repo_id = "R_kgDOIPGBXA"
220 | data_category = "Announcements"
221 | data_category_id = "DIC_kwDOIPGBXM4CSWjp"
222 | data_mapping = "pathname"
223 | data_reactions_enabled = "1"
224 | data_emit_metadata = "0"
225 | data_theme = "light"
226 | data_lang = "en"
227 | crossorigin = "anonymous"
228 |
--------------------------------------------------------------------------------
/content/zh/docs/best-practice/async-message-tracing/_index.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "在 Istio 中实现异步消息调用跟踪"
3 | linkTitle: "在 Istio 中实现异步消息调用跟踪"
4 | weight: 5
5 | date: 2022-07-06
6 | description:
7 | ---
8 |
9 | 在实际项目中,除了同步调用之外,异步消息也是微服务架构中常见的一种通信方式。在本篇文章中,我将继续利用 eshop demo 程序来探讨如何通过 OpenTracing 将 Kafka 异步消息也纳入到 Istio 的分布式调用跟踪中。
10 |
11 | # eshop 示例程序结构
12 |
13 | 如下图所示,demo 程序中增加了发送和接收 Kafka 消息的代码。eshop 微服务在调用 inventory,billing,delivery 服务后,发送了一个 kafka 消息通知,consumer 接收到通知后调用 notification 服务的 REST 接口向用户发送购买成功的邮件通知。
14 | 
15 |
16 | # 将 Kafka 消息处理加入调用链跟踪
17 |
18 | ## 植入 Kafka OpenTracing 代码
19 | 首先从 github 下载代码。
20 |
21 | ```bash
22 | git clone git@github.com:aeraki-framework/method-level-tracing-with-istio.git
23 | ```
24 |
25 | 可以直接使用该代码,但建议跟随下面的步骤查看相关的代码,以了解各个步骤背后的原理。
26 |
27 | 根目录下分为了 rest-service 和 kafka-consumer 两个目录,rest-service 下包含了各个 REST 服务的代码,kafka-consumer 下是 Kafka 消息消费者的代码。
28 |
29 | 首先需要将 spring kafka 和 OpenTracing kafka 的依赖加入到两个目录下的 pom 文件中。
30 |
31 | ```xml
32 | <dependency>
33 |     <groupId>org.springframework.kafka</groupId>
34 |     <artifactId>spring-kafka</artifactId>
35 | </dependency>
36 | <dependency>
37 |     <groupId>io.opentracing.contrib</groupId>
38 |     <artifactId>opentracing-kafka-client</artifactId>
39 |     <version>${version.opentracing.kafka-client}</version>
40 | </dependency>
41 | ```
42 |
43 | 在 rest-service 目录中的 KafkaConfig.java 中配置消息 Producer 端的 OpenTracing Instrument。TracingProducerInterceptor 会在发送 Kafka 消息时生成发送端的 Span。
44 |
45 | ```java
46 | @Bean
47 | public ProducerFactory<String, String> producerFactory() {
48 |     Map<String, Object> configProps = new HashMap<>();
49 | configProps.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, bootstrapAddress);
50 | configProps.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, StringSerializer.class);
51 | configProps.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, StringSerializer.class);
52 | configProps.put(ProducerConfig.INTERCEPTOR_CLASSES_CONFIG, TracingProducerInterceptor.class.getName());
53 | return new DefaultKafkaProducerFactory<>(configProps);
54 | }
55 | ```
56 |
57 | 在 kafka-consumer 目录中的 KafkaConfig.java 中配置消息 Consumer 端的 OpenTracing Instrument。TracingConsumerInterceptor 会在接收到 Kafka 消息时生成接收端的 Span。
58 |
59 | ```java
60 | @Bean
61 | public ConsumerFactory<String, String> consumerFactory() {
62 |     Map<String, Object> props = new HashMap<>();
63 | props.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, bootstrapAddress);
64 | props.put(ConsumerConfig.GROUP_ID_CONFIG, groupId);
65 | props.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
66 | props.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
67 | props.put(ConsumerConfig.INTERCEPTOR_CLASSES_CONFIG, TracingConsumerInterceptor.class.getName());
68 | return new DefaultKafkaConsumerFactory<>(props);
69 | }
70 | ```
71 | 只需要这两步即可完成 Spring 程序的 Kafka OpenTracing 代码植入。下面安装并运行示例程序查看效果。
72 |
73 | ## 安装 Kafka 集群
74 |
75 | 示例程序中使用到了 Kafka 消息,因此我们在 TKE 集群中部署一个简单的 Kafka 实例:
76 |
77 | ```bash
78 | cd method-level-tracing-with-istio
79 | kubectl apply -f k8s/kafka.yaml
80 | ```
81 |
82 | ## 部署 demo 应用
83 |
84 | 修改 Kubernetes yaml 部署文件 k8s/eshop.yaml,设置 Kafka bootstrap server,以用于 demo 程序连接到 Kafka 集群中。
85 |
86 | ```yml
87 | apiVersion: apps/v1
88 | kind: Deployment
89 | metadata:
90 | name: delivery
91 | ......
92 | spec:
93 | containers:
94 | - name: eshop
95 | image: aeraki/istio-opentracing-demo:latest
96 | ports:
97 | - containerPort: 8080
98 | env:
99 | ....
100 | //在这里加入 Kafka server 地址
101 | - name: KAFKA_BOOTSTRAP_SERVERS
102 | value: "kafka-service:9092"
103 |
104 | ---
105 | apiVersion: apps/v1
106 | kind: Deployment
107 | metadata:
108 | name: kafka-consumer
109 | ......
110 | spec:
111 | containers:
112 | - name: kafka-consumer
113 | image: aeraki/istio-opentracing-demo-kafka-consumer:latest
114 | env:
115 | ....
116 | //在这里加入 Kafka server 地址
117 | - name: KAFKA_BOOTSTRAP_SERVERS
118 | value: "kafka-service:9092"
119 | ```
120 |
121 | 然后部署应用程序,相关的镜像可以直接从 dockerhub 下载,也可以通过源码编译生成。
122 |
123 | ```bash
124 | kubectl apply -f k8s/eshop.yaml
125 | ```
126 |
127 | 在浏览器中打开地址:http://${INGRESS_EXTERNAL_IP}/checkout ,以触发调用 eshop 示例程序的 REST 接口。然后打开 TCM 的界面查看生成的分布式调用跟踪信息。
128 | 
129 |
130 | 从图中可以看到,在调用链中增加了两个 Span,分别对应于 Kafka 消息发送和接收的两个操作。由于 Kafka 消息的处理是异步的,消息发送端不直接依赖接收端的处理。根据 OpenTracing 对引用关系的定义,From_eshop_topic Span 对 To_eshop_topic Span 的引用关系是 FOLLOWS_FROM 而不是 CHILD_OF 关系。
131 |
132 | # 将调用跟踪上下文从 Kafka 传递到 REST 服务
133 |
134 | 现在 eshop 代码中已经加入了 REST 和 Kafka 的 OpenTracing Instrumentation,可以在进行 REST 调用和发送 Kafka 消息时生成调用跟踪信息。但如果需要从 Kafka 的消息消费者的处理方法中调用一个 REST 接口呢?
135 |
136 | 我们会发现在 eshop 示例程序中,缺省生成的调用链里面并不会把 Kafka 消费者的 Span 和其发起的调用 notification 服务的 REST 请求的 Span 关联在同一个 Trace 中。
137 |
138 | 要分析导致该问题的原因,我们首先需要了解[“Active Span”](https://opentracing.io/docs/overview/scopes-and-threading/)的概念。在 OpenTracing 中,一个线程可以有一个 Active Span,该 Active Span 代表了目前该线程正在执行的工作。在调用 Tracer.buildSpan() 方法创建新的 Span 时,如果 Tracer 目前存在一个 Active Span,则会将该 Active Span 缺省作为新创建的 Span 的 Parent Span。
139 |
140 | Tracer.buildSpan 方法的说明如下:
141 |
142 | ```java
143 | Tracer.SpanBuilder buildSpan(String operationName)
144 | Return a new SpanBuilder for a Span with the given `operationName`.
145 | You can override the operationName later via BaseSpan.setOperationName(String).
146 |
147 | A contrived example:
148 |
149 |
150 | Tracer tracer = ...
151 |
152 | // Note: if there is a `tracer.activeSpan()`, it will be used as the target of an implicit CHILD_OF
153 | // Reference for "workSpan" when `startActive()` is invoked.
154 | // 如果存在 active span,则其创建的新 Span 会隐式地创建一个 CHILD_OF 引用到该 active span
155 | try (ActiveSpan workSpan = tracer.buildSpan("DoWork").startActive()) {
156 | workSpan.setTag("...", "...");
157 | // etc, etc
158 | }
159 |
160 | // 也可以通过 asChildOf 方法指定新创建的 Span 的 Parent Span
161 | // It's also possible to create Spans manually, bypassing the ActiveSpanSource activation.
162 | Span http = tracer.buildSpan("HandleHTTPRequest")
163 | .asChildOf(rpcSpanContext) // an explicit parent
164 | .withTag("user_agent", req.UserAgent)
165 | .withTag("lucky_number", 42)
166 | .startManual();
167 | ```
168 |
169 | 分析 Kafka OpenTracing Instrumentation 的代码,会发现 TracingConsumerInterceptor 在调用 Kafka 消费者的处理方法之前已经把消费者的 Span 结束了,因此发起 REST 调用时 tracer 没有 active span,不会将 Kafka 消费者的 Span 作为后面 REST 调用的 parent span。
170 |
171 | ```java
172 | public static <K, V> void buildAndFinishChildSpan(ConsumerRecord<K, V> record, Tracer tracer,
173 |       BiFunction<String, ConsumerRecord, String> consumerSpanNameProvider) {
174 | SpanContext parentContext = TracingKafkaUtils.extractSpanContext(record.headers(), tracer);
175 |
176 | String consumerOper =
177 | FROM_PREFIX + record.topic(); // <====== It provides better readability in the UI
178 | Tracer.SpanBuilder spanBuilder = tracer
179 | .buildSpan(consumerSpanNameProvider.apply(consumerOper, record))
180 | .withTag(Tags.SPAN_KIND.getKey(), Tags.SPAN_KIND_CONSUMER);
181 |
182 | if (parentContext != null) {
183 | spanBuilder.addReference(References.FOLLOWS_FROM, parentContext);
184 | }
185 |
186 | Span span = spanBuilder.start();
187 | SpanDecorator.onResponse(record, span);
188 |
189 | //在调用消费者的处理方法之前,该 Span 已经被结束。
190 | span.finish();
191 |
192 | // Inject created span context into record headers for extraction by client to continue span chain
193 | //这个 Span 被放到了 Kafka 消息的 header 中
194 | TracingKafkaUtils.inject(span.context(), record.headers(), tracer);
195 | }
196 | ```
197 |
198 | 此时 TracingConsumerInterceptor 已经将 Kafka 消费者的 Span 放到了 Kafka 消息的 header 中,因此从 Kafka 消息头中取出该 Span,显式地将 Kafka 消费者的 Span 作为 REST 调用的 Parent Span 即可。
199 |
200 | 为 MessageConsumer.java 使用的 RestTemplate 设置一个 TracingKafka2RestTemplateInterceptor:
201 |
202 | ```java
203 | @KafkaListener(topics = "eshop-topic")
204 | public void receiveMessage(ConsumerRecord<String, String> record) {
205 | restTemplate
206 | .setInterceptors(Collections.singletonList(new TracingKafka2RestTemplateInterceptor(record.headers())));
207 | restTemplate.getForEntity("http://notification:8080/sendEmail", String.class);
208 | }
209 | ```
210 |
211 | TracingKafka2RestTemplateInterceptor 是基于 Spring OpenTracing Instrumentation 的 TracingRestTemplateInterceptor 修改的,将从 Kafka header 中取出的 Span 设置为出向请求的 Span 的 Parent Span。
212 |
213 | ```java
214 | @Override
215 | public ClientHttpResponse intercept(HttpRequest httpRequest, byte[] body, ClientHttpRequestExecution execution)
216 | throws IOException {
217 | ClientHttpResponse httpResponse;
218 | SpanContext parentSpanContext = TracingKafkaUtils.extractSpanContext(headers, tracer);
219 | Span span = tracer.buildSpan(httpRequest.getMethod().toString()).asChildOf(parentSpanContext)
220 | .withTag(Tags.SPAN_KIND.getKey(), Tags.SPAN_KIND_CLIENT).start();
221 | ......
222 | }
223 | ```
224 |
225 | 在浏览器中打开地址:http://${INGRESS_EXTERNAL_IP}/checkout ,以触发调用 eshop 示例程序的 REST 接口。然后打开 TCM 的界面查看生成的分布式调用跟踪信息。
226 | 
227 |
228 | 从上图可以看到,调用链中出现了 Kafka 消费者调用 notification 服务的 sendEmail REST 接口的 Span。同时,由于调用链经过了 Kafka 消息,sendEmail Span 的时间没有包含在 checkout Span 中。
229 |
230 | # 总结
231 |
232 | Istio 服务网格通过分布式调用跟踪来提高微服务应用的可见性,这需要在应用程序中通过 HTTP header 传递调用跟踪的上下文。对于 JAVA 应用程序,我们可以使用 OpenTracing Instrumentation 来代替应用编码传递分布式跟踪的相关 http header,以减少对业务代码的影响;我们还可以将方法级的调用跟踪和 Kafka 消息的调用跟踪加入到 Istio 生成的调用跟踪链中,以为应用程序的故障定位提供更为丰富详细的调用跟踪信息。
233 |
234 | # 参考资料
235 |
236 | 1. [本文中 eshop 示例程序的源代码](https://github.com/aeraki-framework/method-level-tracing-with-istio)
237 |
238 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/content/zh/docs/best-practice/method-level-trcing/_index.md:
--------------------------------------------------------------------------------
1 |
2 | ---
3 | title: "Implementing Method-Level Tracing in Istio"
4 | linkTitle: "Implementing Method-Level Tracing in Istio"
5 | weight: 4
6 | date: 2022-07-06
7 | description:
8 | ---
9 |
10 | Using an online store demo application, this article shows how to use Spring and OpenTracing to simplify the propagation of tracing context in application code, and how to add fine-grained, method-level tracing on top of the inter-process tracing that Istio provides.
11 |
12 | # Distributed Tracing and the OpenTracing Specification
13 |
14 | ## What is distributed tracing?
15 |
16 | Compared with a traditional monolithic application, one major change that microservices bring is that the modules of an application are split into independent processes. In a microservice architecture, method calls that used to happen inside one process become cross-process RPC calls. Debugging and troubleshooting cross-process calls is much harder than for in-process calls, and traditional debuggers or log statements are of little help for inspecting and analyzing a distributed call.
17 | 
18 | As shown above, a single client request passes through multiple microservice processes. To analyze such a request, the relevant information from every service it traverses must be collected and correlated; this is what "distributed tracing" does.
19 |
20 | ## What is OpenTracing?
21 |
22 | ### The CNCF OpenTracing project
23 |
24 | [OpenTracing](https://opentracing.io/) is a [CNCF](https://www.cncf.io/) (Cloud Native Computing Foundation) project that defines a vendor-neutral specification for distributed tracing, together with APIs, frameworks, and libraries for various languages. Its goal is to standardize distributed tracing across implementations. There are already many [Tracer implementations that support the OpenTracing specification](https://opentracing.io/docs/supported-tracers/), including Jaeger, SkyWalking, and LightStep. By instrumenting a microservice application with the OpenTracing API, we avoid vendor lock-in and can connect to any OpenTracing-compatible backend at minimal cost.
25 |
26 | ### OpenTracing concept model
27 |
28 | The OpenTracing concept model is illustrated below:
29 |
30 | 
31 | Image source: [https://opentracing.io/](https://opentracing.io/)
32 | As shown in the figure, OpenTracing defines the following main concepts:
33 |
34 | * Trace: an end-to-end transaction in a distributed system, for example a single request from a client.
35 | * Span: an operation with a name and a duration, for example a REST call or a database operation. A Span is the smallest unit of tracing; a Trace is made up of multiple Spans.
36 | * Span context: the context of a distributed trace, including the trace id, the span id, and any other data that needs to be passed to downstream services. An OpenTracing implementation must propagate the span context across process boundaries via some serialization mechanism (a wire protocol) so that Spans from different processes can be correlated into the same Trace. The wire protocol can be text based, such as HTTP headers, or a binary protocol.
37 |
38 | ### OpenTracing data model
39 |
40 | A Trace can be viewed as a directed acyclic graph (DAG) of related Spans. The diagram below shows a Trace composed of 8 Spans:
41 |
42 | ```
43 | [Span A] ←←←(the root span)
44 | |
45 | +------+------+
46 | | |
47 | [Span B] [Span C] ←←←(Span C is a `ChildOf` Span A)
48 | | |
49 | [Span D] +---+-------+
50 | | |
51 | [Span E] [Span F] >>> [Span G] >>> [Span H]
52 | ↑
53 | ↑
54 | ↑
55 | (Span G `FollowsFrom` Span F)
56 | ```
57 | The same trace can also be shown on a timeline:
58 | ```
59 | ––|–––––––|–––––––|–––––––|–––––––|–––––––|–––––––|–––––––|–> time
60 |
61 | [Span A···················································]
62 | [Span B··············································]
63 | [Span D··········································]
64 | [Span C········································]
65 | [Span E·······] [Span F··] [Span G··] [Span H··]
66 | ```
67 |
68 | A Span's data structure contains the following:
69 |
70 | * name: the name of the operation the Span represents, for example the resource name of a REST endpoint.
71 | * Start timestamp: when the operation represented by the Span started.
72 | * Finish timestamp: when the operation represented by the Span finished.
73 | * Tags: a set of key-value pairs carrying any information useful for call analysis, such as the method name or URL.
74 | * SpanContext: carries Span-related information across process boundaries; it must be combined with a serialization protocol (wire protocol) when propagated.
75 | * References: related Spans that this Span references. There are two main reference types, ChildOf and FollowsFrom (see the sketch after this list).
76 |     * ChildOf: the most common reference, meaning there is a direct dependency between the parent Span and the child Span, for example an RPC server Span and its RPC client Span, or a SQL insert Span and the ORM save Span that triggered it.
77 |     * FollowsFrom: used when the parent Span does not depend on the result of the child Span. For example, an online store sends the user an email notification after payment; whether or not the email is delivered has no effect on the payment's success, so FollowsFrom is appropriate here.
78 |
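To make the two reference types concrete, below is a minimal sketch using the OpenTracing Java API. The operation names (checkout, payment, sendEmail) are illustrative, borrowed from the online-store scenario, and `tracer` can be any OpenTracing-compatible Tracer.

```java
import io.opentracing.References;
import io.opentracing.Span;
import io.opentracing.Tracer;

public class SpanReferenceExample {

    public static void run(Tracer tracer) {
        Span checkout = tracer.buildSpan("checkout").start();

        // ChildOf: checkout depends on the result of the payment operation.
        Span payment = tracer.buildSpan("payment")
                .asChildOf(checkout)
                .start();
        payment.finish();

        // FollowsFrom: the email notification is triggered by checkout,
        // but checkout does not depend on its outcome.
        Span sendEmail = tracer.buildSpan("sendEmail")
                .addReference(References.FOLLOWS_FROM, checkout.context())
                .start();
        sendEmail.finish();

        checkout.finish();
    }
}
```
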
79 | ### Propagating trace context across processes
80 |
81 | SpanContext is one of the more confusing concepts in OpenTracing. The concept model says that SpanContext is used to carry the distributed tracing context across process boundaries, but in fact OpenTracing only defines an abstract SpanContext interface, which encapsulates the context of a Span in a distributed call: the trace id and span id the Span belongs to, plus any other data that needs to be passed to downstream services. SpanContext by itself cannot cross process boundaries; a Tracer (an implementation of the OpenTracing specification, such as the Jaeger or SkyWalking tracer) must serialize the SpanContext and send it over a wire protocol to the next process, where it is deserialized to recover the context and create child Spans, as sketched in the example below.
82 |
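As a concrete sketch of these inject/extract mechanics (assuming the io.opentracing Java API, 0.32 or later, where TextMapAdapter is available): the caller injects the SpanContext into a map of HTTP headers, and the callee extracts it to start a child Span.

```java
import java.util.HashMap;
import java.util.Map;

import io.opentracing.Span;
import io.opentracing.SpanContext;
import io.opentracing.Tracer;
import io.opentracing.propagation.Format;
import io.opentracing.propagation.TextMapAdapter;

public class ContextPropagationExample {

    // Client side: serialize the SpanContext of the current span into outgoing HTTP headers.
    public static Map<String, String> inject(Tracer tracer, Span span) {
        Map<String, String> headers = new HashMap<>();
        tracer.inject(span.context(), Format.Builtin.HTTP_HEADERS, new TextMapAdapter(headers));
        // With a B3-propagating tracer, "headers" now holds x-b3-traceid, x-b3-spanid, etc.
        return headers;
    }

    // Server side: deserialize the SpanContext from incoming headers and start a child span.
    public static Span extractAndStartChild(Tracer tracer, Map<String, String> headers, String operation) {
        SpanContext parent = tracer.extract(Format.Builtin.HTTP_HEADERS, new TextMapAdapter(headers));
        return tracer.buildSpan(operation).asChildOf(parent).start();
    }
}
```
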
83 | To give implementations maximum flexibility, OpenTracing only requires that SpanContext be propagated across processes; it does not mandate how the SpanContext is serialized and transmitted over the network. Each Tracer is free to choose its own wire protocol for carrying the SpanContext.
84 |
85 | In HTTP-based distributed calls, HTTP headers are usually used to carry the SpanContext. Common wire protocols include the [b3 HTTP headers](https://github.com/openzipkin/b3-propagation) used by Zipkin, the [uber-trace-id HTTP header](https://www.jaegertracing.io/docs/1.7/client-libraries/#trace-span-identity) used by Jaeger, and the "x-ot-span-context" HTTP header used by LightStep. Istio/Envoy supports the b3 headers and the x-ot-span-context header, so it can work with Zipkin, Jaeger, and LightStep. An example of the b3 HTTP headers:
86 |
87 | ```
88 | X-B3-TraceId: 80f198ee56343ba864fe8b2a57d3eff7
89 | X-B3-ParentSpanId: 05e3ac9a4f6e3b90
90 | X-B3-SpanId: e457b5a2e4d86bd1
91 | X-B3-Sampled: 1
92 | ```
93 |
94 | # Istio's Support for Distributed Tracing
95 |
96 | Istio/Envoy provides out-of-the-box distributed tracing for microservices. In a mesh where Istio and Envoy are installed, Envoy intercepts each service's inbound and outbound requests and automatically generates tracing data for every call. By connecting a tracing backend such as Zipkin or Jaeger to the mesh, you can inspect the details of a distributed request: which services it passed through, which REST interfaces were called, and how long each of them took.
97 |
98 | Note that although Istio/Envoy does most of the work here, it still requires a small change to application code: the application must copy the b3 headers from the inbound HTTP request into the HTTP requests it sends downstream, so that the tracing context is propagated to downstream services. Envoy cannot do this on the application's behalf, because it knows nothing about the business logic inside the service and therefore cannot correlate inbound and outbound requests. The amount of code involved is small, but every place that issues an HTTP request must be modified, which is tedious and easy to miss. Of course, the HTTP client code can be wrapped in a shared library for business modules to use, which simplifies the work.
99 |
100 | The following uses a simple online store demo to show how Istio provides distributed tracing. The demo consists of the eshop, inventory, billing, and delivery microservices, structured as shown below:
101 | 
102 | The eshop microservice receives requests from clients and calls the REST interfaces of the inventory, billing, and delivery backend microservices to implement the checkout business logic. The source code of this example can be downloaded from GitHub: https://github.com/aeraki-framework/method-level-tracing-with-istio
103 |
104 | As the code below shows, we need to propagate the b3 HTTP headers in the application code of the eshop microservice.
105 |
106 | ```java
107 | @RequestMapping(value = "/checkout")
108 | public String checkout(@RequestHeader HttpHeaders headers) {
109 |     String result = "";
110 |     // Use HTTP GET in this demo. In a real-world use case, we should use HTTP POST
111 |     // instead.
112 |     // The three services are bundled in one jar for simplicity. To make it work,
113 |     // define three services in Kubernetes.
114 |     result += restTemplate.exchange("http://inventory:8080/createOrder", HttpMethod.GET,
115 |             new HttpEntity<>(passTracingHeader(headers)), String.class).getBody();
116 |     result += "<br>";
117 |     result += restTemplate.exchange("http://billing:8080/payment", HttpMethod.GET,
118 |             new HttpEntity<>(passTracingHeader(headers)), String.class).getBody();
119 |     result += "<br>";
120 |     result += restTemplate.exchange("http://delivery:8080/arrangeDelivery", HttpMethod.GET,
121 |             new HttpEntity<>(passTracingHeader(headers)), String.class).getBody();
122 |     return result;
123 | }
124 | private HttpHeaders passTracingHeader(HttpHeaders headers) {
125 |     HttpHeaders tracingHeaders = new HttpHeaders();
126 |     extractHeader(headers, tracingHeaders, "x-request-id");
127 |     extractHeader(headers, tracingHeaders, "x-b3-traceid");
128 |     extractHeader(headers, tracingHeaders, "x-b3-spanid");
129 |     extractHeader(headers, tracingHeaders, "x-b3-parentspanid");
130 |     extractHeader(headers, tracingHeaders, "x-b3-sampled");
131 |     extractHeader(headers, tracingHeaders, "x-b3-flags");
132 |     extractHeader(headers, tracingHeaders, "x-ot-span-context");
133 |     return tracingHeaders;
134 | }
135 |
136 | ```
137 |
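The extractHeader helper called above is not shown in the snippet. Judging from how it is used, a minimal sketch would simply copy a header when it is present on the inbound request (the actual implementation in the demo repository may differ):

```java
private void extractHeader(HttpHeaders headers, HttpHeaders tracingHeaders, String headerName) {
    // Copy the header into the outgoing request only if the inbound request carries it.
    String value = headers.getFirst(headerName);
    if (value != null) {
        tracingHeaders.set(headerName, value);
    }
}
```
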
138 | Now let's test the eshop demo. We could set up our own Kubernetes cluster and install Istio for testing; for convenience, this article uses the fully managed service mesh [TCM](https://console.cloud.tencent.com/tke2/mesh?rid=16) on Tencent Cloud and adds a [TKE](https://console.cloud.tencent.com/tke2/cluster/startUp) container cluster to the mesh for the test.
139 |
140 | Deploy the demo to the TKE cluster to see Istio's distributed tracing in action.
141 |
142 | ```bash
143 | git clone git@github.com:aeraki-framework/method-level-tracing-with-istio.git
144 | cd method-level-tracing-with-istio
145 | git checkout without-opentracing
146 | kubectl apply -f k8s/eshop.yaml
147 | ```
148 |
149 | * Open http://${INGRESS_EXTERNAL_IP}/checkout in a browser to trigger a call to the eshop demo's REST interface.
150 | * Open the TCM console in a browser to view the generated distributed trace.
151 |
152 | The TCM UI shows the details of this call intuitively: the client request enters the system through the ingress gateway, then calls the checkout interface of the eshop microservice; the checkout call has three child spans, corresponding to the REST interfaces of the inventory, billing, and delivery microservices.
153 | 
154 |
155 | # Using OpenTracing to Propagate the Tracing Context
156 |
157 | OpenTracing provides Spring-based instrumentation, so we can let the OpenTracing Spring framework handle the HTTP header propagation and avoid this hand-written code. Using OpenTracing to propagate the tracing context in Spring takes only two steps:
158 |
159 | * Declare the required dependencies in the Maven POM: the OpenTracing Spring Cloud Starter, plus the Zipkin-related dependencies, because Istio uses Zipkin's reporting API.
160 | * Declare a Tracer bean in the Spring application, as shown below. Note that the Zipkin reporting address used in Istio must be set on the OkHttpSender.
161 |
162 | ```java
163 | @Bean
164 | public io.opentracing.Tracer zipkinTracer() {
165 |     // Report spans to the Zipkin-compatible collector exposed by Istio.
166 |     String zipkinEndpoint = System.getenv("ZIPKIN_ENDPOINT");
167 |     if (zipkinEndpoint == null || zipkinEndpoint.isEmpty()) {
168 |         zipkinEndpoint = "http://zipkin.istio-system:9411/api/v2/spans";
169 |     }
170 |
171 |     OkHttpSender sender = OkHttpSender.create(zipkinEndpoint);
172 |     Reporter spanReporter = AsyncReporter.create(sender);
173 |
174 |     // Use B3 propagation so the generated headers match what Istio/Envoy forwards.
175 |     Tracing braveTracing = Tracing.newBuilder()
176 |             .localServiceName("spring-boot")
177 |             .spanReporter(spanReporter)
178 |             .propagationFactory(B3Propagation.FACTORY)
179 |             .traceId128Bit(true)
180 |             .sampler(Sampler.ALWAYS_SAMPLE)
181 |             .build();
182 |     return BraveTracer.create(braveTracing);
183 | }
188 | ```
189 |
190 | Deploy the version of the demo that uses OpenTracing to propagate the HTTP headers; its trace looks like this:
191 | 
192 | Compared with propagating the HTTP headers directly in application code, the same call now contains 7 additional Spans whose names are prefixed with spring-boot; these Spans are generated by the OpenTracing tracer. Although we did not explicitly create them in the code, the OpenTracing instrumentation automatically creates a Span for every REST request and links them according to the call relationships.
193 |
194 | These OpenTracing-generated Spans give us more detailed tracing information. From them we can analyze how long each step of an HTTP call takes: from the client application code issuing the request, through the client-side Envoy, to the server-side Envoy, and finally to the server receiving the request. The figure shows that Envoy's forwarding takes around 1 millisecond, which is very short compared with the business processing time; for this application, Envoy's processing and forwarding has essentially no impact on the efficiency of handling business requests.
195 |
196 | # Adding Method-Level Spans to the Istio Trace
197 |
198 | Istio/Envoy provides tracing across service boundaries, and in most cases service-level spans are enough for performance and failure analysis. For some services, however, finer-grained information is needed, for example how much time the business logic and the database access inside a single REST request each take. In such cases we need to instrument the service code itself and correlate the spans reported from the service code with the spans generated by Envoy, so that both are presented in a single trace.
199 |
200 | The code for tracing a method is the same everywhere, so we implement it with AOP and an annotation to keep the code simple.
201 | First define a Traced annotation and the corresponding AOP logic:
202 |
203 | ```java
204 | @Retention(RetentionPolicy.RUNTIME)
205 | @Target(ElementType.METHOD)
206 | @Documented
207 | public @interface Traced {
208 | }
209 | ```
210 | ```java
211 | @Aspect
212 | @Component
213 | public class TracingAspect {
214 | @Autowired
215 | Tracer tracer;
216 |
217 | @Around("@annotation(com.zhaohuabing.demo.instrument.Traced)")
218 | public Object aroundAdvice(ProceedingJoinPoint jp) throws Throwable {
219 | String class_name = jp.getTarget().getClass().getName();
220 | String method_name = jp.getSignature().getName();
221 | Span span = tracer.buildSpan(class_name + "." + method_name).withTag("class", class_name)
222 | .withTag("method", method_name).start();
223 | Object result;
224 | try { result = jp.proceed(); } finally { span.finish(); } // finish the span even if the traced method throws
225 | return result;
226 | }
227 | }
228 | ```
229 |
230 | Then add the Traced annotation to the methods that should be traced:
231 |
232 | ```java
233 | @Component
234 | public class DBAccess {
235 |
236 | @Traced
237 | public void save2db() {
238 | try {
239 | Thread.sleep((long) (Math.random() * 100));
240 | } catch (InterruptedException e) {
241 | e.printStackTrace();
242 | }
243 | }
244 | }
245 | ```
246 |
247 | ```java
248 | @Component
249 | public class BankTransaction {
250 | @Traced
251 | public void transfer() {
252 | try {
253 | Thread.sleep((long) (Math.random() * 100));
254 | } catch (InterruptedException e) {
255 | e.printStackTrace();
256 | }
257 | }
258 | }
259 | ```
260 |
261 | The master branch of the demo already includes method-level tracing and can be deployed directly.
262 |
263 | ```bash
264 | git checkout master
265 | kubectl apply -f k8s/eshop.yaml
266 | ```
267 |
268 | The result is shown below: the trace now contains two method-level Spans, transfer and save2db.
269 | 
270 | You can open a method's Span to see its details, including the Java class name and the method name. In the AOP code you can also record additional information when an exception occurs, such as the exception stack trace, as sketched after the screenshot below.
271 | 
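For example, the advice could be extended to mark the span as failed and attach the exception before rethrowing. This is a sketch rather than code from the demo; it reuses the TracingAspect context above and additionally assumes imports of java.util.Map, java.util.HashMap, and io.opentracing.tag.Tags.

```java
@Around("@annotation(com.zhaohuabing.demo.instrument.Traced)")
public Object aroundAdvice(ProceedingJoinPoint jp) throws Throwable {
    String className = jp.getTarget().getClass().getName();
    String methodName = jp.getSignature().getName();
    Span span = tracer.buildSpan(className + "." + methodName)
            .withTag("class", className)
            .withTag("method", methodName)
            .start();
    try {
        return jp.proceed();
    } catch (Throwable t) {
        // Mark the span as failed and record the exception details on it.
        Tags.ERROR.set(span, true);
        Map<String, Object> fields = new HashMap<>();
        fields.put("event", "error");
        fields.put("error.object", t);
        fields.put("message", t.getMessage());
        span.log(fields);
        throw t;
    } finally {
        span.finish();
    }
}
```
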
272 | # Summary
273 |
274 | Istio/Envoy provides distributed tracing for microservice applications, improving the visibility of service calls. We can use OpenTracing instead of hand-written code to propagate the tracing-related HTTP headers, and we can also use OpenTracing to add method-level spans to the trace that Istio/Envoy generates by default, providing finer-grained tracing information.
275 |
276 | # Next Steps
277 |
278 | Besides synchronous calls, asynchronous messaging is another common communication pattern in microservice architectures. In the next article I will continue using the eshop demo to explore how to bring Kafka asynchronous messages into Istio's distributed tracing via OpenTracing.
279 |
280 | # References
281 |
282 | 1. [Source code of the eshop demo used in this article](https://github.com/aeraki-framework/method-level-tracing-with-istio)
283 | 1. [OpenTracing docs](https://opentracing.io/docs/)
284 | 1. [OpenTracing specification](https://github.com/opentracing/specification/blob/master/specification.md)
285 | 1. [OpenTracing wire protocols](https://github.com/opentracing/specification/blob/master/rfc/trace_identifiers.md)
286 | 1. [Istio trace context propagation](https://istio.io/docs/tasks/telemetry/distributed-tracing/overview/#trace-context-propagation)
287 | 1. [Zipkin B3 propagation](https://github.com/apache/incubator-zipkin-b3-propagation)
288 | 1. [OpenTracing Project Deep Dive](https://www.youtube.com/watch?v=ySR_FVNX4bQ&t=184s)
--------------------------------------------------------------------------------